/*  atan2f.s -- atan2f for standard math library.

    Written by Eric Postpischil, July 2007.
*/


    .literal8

// Define miscellaneous constants.

Threshold:  .double 2.384185791015625e-07       // 2**-22.
nZero:      .double -0

pPi1v4:     .double +.7853981633974483096156608 // 1/4 pi.
nPi1v4:     .double -.7853981633974483096156608
pPi1v2:     .double +1.570796326794896619231322 // 1/2 pi.
nPi1v2:     .double -1.570796326794896619231322
pPi3v4:     .double +2.356194490192344928846982 // 3/4 pi.
nPi3v4:     .double -2.356194490192344928846982

/*  Define values near +/- pi that yield +/- pi rounded toward zero when
    converted to single precision.  This allows us to generate inexact and
    return the desired values for atan2f on and near the negative side of the x
    axis.
*/
AlmostpPi:  .double +3.1415924

// Define a coefficient for center polynomial (used for x in [-1, +1]).
C2:         .double  0.0029352921857004596570518


    .const
    .align  4

/*  Define some coefficients for center polynomial (used for x in [-1, +1]).
    These are stored in pairs at aligned addresses for use in SIMD
    instructions.
*/
C01:        .double  2.2971562298314633761676433,  0.0207432003007420961489920
C00:        .double  2.4449692297316409651126041,  3.7888879287802702842997915
C11:        .double -2.9466967515109826289085300, -4.9721072376211623038916292
C10:        .double  5.4728447324456990092824269,  6.7197076223592378022736307

// This needs to be 16-byte aligned because it is used in an orpd instruction.
    .align  4
pPi:        .double +3.141592653589793238462643 // pi.


// Rename the general registers (just to make it easier to keep track of them).
#if defined __i386__
    #define r0  %eax
    #define r1  %ecx
    #define r2  %edx
    #define r3  %ebx
    #define r4  %esp
    #define r5  %ebp
    #define r6  %esi
    #define r7  %edi
#elif defined __x86_64__
    #define r0  %rax
    #define r1  %rcx
    #define r2  %rdx
    #define r3  %rbx
    #define r4  %rsp
    #define r5  %rbp
    #define r6  %rsi
    #define r7  %rdi
#else
    #error "Unknown architecture."
#endif


    .text


// Define various symbols.

#define BaseP       r0      // Base address for position-independent addressing.

#define y           %xmm0   // Must be in %xmm0 for return on x86_64.
#define x           %xmm1
#define p0          %xmm2
#define x1          %xmm3
#define t0          %xmm4
#define Base        %xmm5
#define p1          %xmm6

#if defined __i386__

    // Define locations of arguments.
    #define Argy            4(%esp)
    #define Argx            8(%esp)

    // Define how to address data.  BaseP must contain the address of label 0.
    #define Address(label)  label-0b(BaseP)

#elif defined __x86_64__

    // Define locations of arguments.
    #define Argx            %xmm1
    #define Argy            %xmm0

    // Define how to address data.
    #define Address(label)  label(%rip)

#endif


/*  float atan2f(float x).

    Notes:

        This routine has not been proven to be correct.  See the notes in the
        accompanying C version regarding potential proof.  The polynomial it
        uses was proven to provide faithfully rounded results in atanf.  atan2f
        introduces additional error performing the division and additional
        points used in the domain of the polynomial.

        Citations in parentheses below indicate the source of a requirement.

        "C" stands for ISO/IEC 9899:TC2.

        The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
        requirements since it defers to C and requires errno behavior only if
        we choose to support it by arranging for "math_errhandling &
        MATH_ERRNO" to be non-zero, which we do not.

    Return value for atan2f(y, x) (C F.9.1 12 and C F.9.1.4):

        y           x           atan2f(y, x)

        -infinity   -infinity   -3*pi/4.
                    < 0         -2*pi/4.
                    -0          -2*pi/4.
                    +0          -2*pi/4.
                    > 0         -2*pi/4.
                    +infinity   -1*pi/4.

        < 0         -infinity   -4*pi/4.
                    < 0         arctangent(y/x) in [-4*pi/4, -2*pi/4].
                    -0          -2*pi/4.
                    +0          -2*pi/4.
                    > 0         arctangent(y/x) in [-2*pi/4, -0*pi/4].
                    +infinity   -0*pi/4.

        -0          -infinity   -4*pi/4.
                    < 0         -4*pi/4.
                    -0          -4*pi/4.
                    +0          -0*pi/4.
                    > 0         -0*pi/4.
                    +infinity   -0*pi/4.

        +0          -infinity   +4*pi/4.
                    < 0         +4*pi/4.
                    -0          +4*pi/4.
                    +0          +0*pi/4.
                    > 0         +0*pi/4.
                    +infinity   +0*pi/4.

        > 0         -infinity   +4*pi/4.
                    < 0         arctangent(y/x) in [+2*pi/4, +4*pi/4].
                    -0          +2*pi/4.
                    +0          +2*pi/4.
                    > 0         arctangent(y/x) in [+0*pi/4, +2*pi/4].
                    +infinity   +0*pi/4.

        +infinity   -infinity   +3*pi/4.
                    < 0         +2*pi/4.
                    -0          +2*pi/4.
                    +0          +2*pi/4.
                    > 0         +2*pi/4.
                    +infinity   +1*pi/4.

        If either input is a NaN, return one of the NaNs in the input.  (C F.9
        11 and 13).  (If the NaN is a signalling NaN, we return the "same" NaN
        quieted.)

        Otherwise:

            If the rounding mode is round-to-nearest, return arctangent(x)
            faithfully rounded.

            Returns a value in [-pi, +pi] (C 7.12.4.4 3).  Note that this
            prohibits returning correctly rounded values for -pi and +pi, since
            pi rounded to a float lies outside that interval.
        
            Not implemented:  In other rounding modes, return arctangent(x)
            possibly with slightly worse error, not necessarily honoring the
            rounding mode (Ali Sazegari narrowing C F.9 10).

    Exceptions:

        Raise underflow for a denormal result (C F.9 7 and Draft Standard for
        Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
        smallest normal, underflow may or may not be raised.  This is stricter
        than the older 754 standard.

        May or may not raise inexact, even if the result is exact (C F.9 8).

        Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
        of C 4.2.1)  but not if the input is a quiet NaN (C F.9 11).

        May not raise exceptions otherwise (C F.9 9).

    Properties:

        We desire this routine to be monotonic, but that has not
        been proven.  (For atan2f, monotonicity would mean, if (x0, y0) and
        (x1, y1) are in the same quadrant, then y0/x0 <= y1/x1 implies
        atan2f(y0, x0) <= atan2f(y1, x1).)
*/
    .align  5
    .globl _atan2f
_atan2f:

    cvtss2sd    Argy, y                 // Convert to double precision.
    cvtss2sd    Argx, x

    #if defined __i386__

        // Get address of 0 in BaseP.
            call    0f                  // Push program counter onto stack.
        0:
            pop     BaseP               // Get program counter.

    #endif

#define nx  t0
    movsd       Address(nZero), nx
    xorpd       x, nx                   // Negate x.

    ucomisd     x, y
    jae         yGEx                    // If we jump, y >= x.
    je          Unordered               // If we jump, an operand is a NaN.
                                        // If we fall through, y < x.

    ucomisd     y, nx
    jae         yLTx_and_nxGEy          // If we jump, y < x and -x >= y.
                                        // If we fall through, -x < y < x.

    // Return atand(y/x).
    divsd       x, y                    // Form y/x.
    movsd       y, x                    // Move to register used by Polynomial.
    movsd       Address(nZero), Base    // Set Base to -0.
        // This makes the return value -0 if y is -0 and x > 0.
    jmp         Polynomial              // Return 0 + arctangent(y/x).


yLTx_and_nxGEy:                         // Here y < x && y <= -x.
    je          yLTx_and_nxEQy          // If we jump, y < x && -x == y.
                                        // If we fall through, y < x < -y.

    // Return -pi/2 - atand(x/y).
    movsd       Address(nZero), t0      // Get -0 for sign bit.
    divsd       y, x                    // Form x/y.
    xorpd       t0, x                   // Form -x/y.
    movsd       Address(nPi1v2), Base   // Set Base to -pi/2.
    jmp         Polynomial              // Return -pi/2 + arctangent(-x/y).


yLTx_and_nxEQy:                         // Here -x == y < x.
    // Return -1/4*pi with inexact exception.
    cvtsd2ss    Address(nPi1v4), y
    jmp         ReturnSingle


yGEx:                                   // Here y >= x.
    je          yEQx                    // If we jump, y == x.
                                        // If we fall through, y > x.

    ucomisd     y, nx
#undef  nx
    jae         yGTx_and_nxGEy          // If we jump, y > x && -x >= y.
                                        // If we fall through, -y < x < y.

    // Return +pi/2 - atand(x/y).
    movsd       Address(nZero), t0      // Get -0 for sign bit.
    divsd       y, x                    // Form x/y.
    movsd       Address(pPi1v2), Base   // Set Base to +pi/2.
    xorpd       t0, x                   // Form -x/y.
    jmp         Polynomial              // Return +pi/2 + arctangent(-x/y).


yGTx_and_nxGEy:                         // Here y > x && -x >= y.
    je          yGTx_and_nxEQy          // If we jump, y > x && -x == y.
                                        // If we fall through, x < y < -x.

    movsd       Address(nZero), Base    // Get mask for sign bit.
    movapd      Base, t0                // Copy mask.
    andpd       y, Base                 // Extract sign bit of y.
    divsd       x, y                    // Form y/x.
    andnpd      y, t0                   // Take absolute value of quotient.
    comisd      Address(Threshold), t0  // Is quotient small?
    jbe         NearNegativeXAxis
    orpd        Address(pPi), Base      // Set Base to pi with y's sign.
    movsd       y, x                    // Move to register used by Polynomial.
    jmp         Polynomial              // Return +/- pi + arctangent(y/x).


NearNegativeXAxis:
    // Return -pi or +pi, matching the sign of y, rounded toward zero.
    movsd       Address(AlmostpPi), p0
    xorpd       Base, p0
    jmp         ReturnDouble


yGTx_and_nxEQy:                         // Here x < y == -x.
    // Return +3/4*pi with inexact exception.
    cvtsd2ss    Address(pPi3v4), y
    jmp         ReturnSingle


yEQx:                                   // Here y == x.

    ucomisd     Address(nZero), y
    jae         yEQx_and_yGE0           // If we jump, y == x && y >= 0.
                                        // If we fall through, x == y < -x.

    // Return -3/4*pi with inexact exception.
    cvtsd2ss    Address(nPi3v4), y
    jmp         ReturnSingle


yEQx_and_yGE0:                          // Here y == x && y >= 0.
    je          yEQx_and_yEQ0           // If we jump, y == x && y == 0.
                                        // If we fall through, -x < y == x.

    // Return +1/4*pi with inexact exception.
    cvtsd2ss    Address(pPi1v4), y
    jmp         ReturnSingle


yEQx_and_yEQ0:                          // Here y == x == 0.

    /*  Return:
            x   y   atan2f(y, x)
            -0  -0  -pi, with inexact exception.
            -0  +0  +pi, with inexact exception.
            +0  -0  -0.
            +0  +0  +0.
    */

    /*  We want to know if x is -0 or +0, but there is no direct test for that
        that puts the results in a vector register.  We do an arithmetic right
        shift to fill up the exponent bits with copies of the sign bit.  This
        produces a NaN or +0.  Then we test for "unordered", which yields all
        one bits if x was -0 and all zero bits if x was +0.
    */
    psraw       $12, x
    cmpunordsd  x, x

    // Form (almost) pi if x was -0 and 0 if x was +0.
    movsd       Address(AlmostpPi), p0
    andpd       x, p0

    orpd        y, p0                   // Apply the sign bit from y.
    jmp         ReturnDouble


Unordered:                              // Here x or y is a NaN.
    addsd       x, y                    // Return one of the NaNs, quieted.
    cvtsd2ss    y, y
    jmp         ReturnSingle


/*  This is the principal arctangent evaluation.  Previous code has prepared
    the Base and y registers, and we need to calculate Base + arctangent(y).
    The result is then converted to a single-precision number and returned.

    -1 <= y <= +1.  (Actually, -1 < y < +1.  The equalities were sidetracked
    during all the branching above, and division of two different
    single-precision numbers converted to double-precision never rounds to
    one.)

    There are some slight inefficiencies here, in that special cases could omit
    a few instructions -- sometimes the base is zero or y had to be negated to
    fit this common code.  So, if speed is all important, this routine might be
    speeded up a little by replicating this code.
*/
Polynomial:

/*  The polynomial that approximates arctangent has been arranged into the
    form:

        x * c2
            * ((x**4 + c01 * x**2 + c00))
            * ((x**4 + c11 * x**2 + c10))
            * ((x**4 + c21 * x**2 + c20))
            * ((x**4 + c31 * x**2 + c30))

    The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
    at C00, c11 and c31 at C11, and c10 and c30 at C10.  c2 is at C2.

    The quartic factors are evaluated in SIMD registers.  For brevity, some
    comments below describe only one element of a register.  The other is
    analagous.
*/
    movsd       x, x1               // Save a copy of x for later.
    movapd      Address(C11), p1
    mulsd       x, x                // Form x**2.
#define x2  x   // Define name describing current register contents.
    movlhps     x2, x2              // Duplicate x**2.
    addpd       x2, p1              // Form x**2 + c11.
    mulpd       x2, p1              // Form x**4 + c11 * x**2.
    addpd       Address(C10), p1    // Form x**4 + c11 * x**2 + c10.
    movapd      Address(C01), p0    // Get first coefficients.
    addpd       x2, p0              // Form x**2 + c01.
    mulpd       x2, p0              // Form x**4 + c01 * x**2.
    movhpd      Address(C2), x1     // Put c2 in one element, with x in other.
    addpd       Address(C00), p0    // Form x**4 + c01 * x**2 + c00.
    mulpd       p1, p0              // Combine factors.
    mulpd       x1, p0              // Multiply by x and c2.
    movhlps     p0, p1              // Get high element.
    mulsd       p1, p0              // Finish combining factors.
#undef x2
    addsd       Base, p0

// Return the double-precision number currently in p0.
ReturnDouble:
    cvtsd2ss    p0, y               // Convert result to single precision.

// Return the single-precision number currently in p0.
ReturnSingle:

    #if defined __i386__
        movss       y, Argx         // Shuttle result through memory.
            // This uses the argument area for scratch space, which is allowed.
        flds        Argx            // Return input on floating-point stack.
    #else
        // On x86_64, the return value is now in y, which is %xmm0.
    #endif

    ret