/*  atanf.s -- atanf for standard math library.

    Written by Eric Postpischil, July 2007.
*/


    .literal8

// Define the points where our algorithm changes.
nPoint0:    .double -1
pPoint0:    .double +1
nPoint1:    .double -16777216
pPoint1:    .double +16777216

// Define miscellaneous constants.
nHalfPi:    .double -1.5707963267948966192313217
One:        .double  1
pHalfPi:    .double +1.5707963267948966192313217
Three:      .double  3

// Define a coefficient for center polynomial (used for x in [-1, +1]).
C2:         .double  0.0029352921857004596570518


    .const
    .align  4

/*  Define some coefficients for center polynomial (used for x in [-1, +1]).
    These are stored in pairs at aligned addresses for use in SIMD
    instructions.
*/
C01:        .double  2.2971562298314633761676433,  0.0207432003007420961489920
C00:        .double  2.4449692297316409651126041,  3.7888879287802702842997915
C11:        .double -2.9466967515109826289085300, -4.9721072376211623038916292
C10:        .double  5.4728447324456990092824269,  6.7197076223592378022736307

// Define coefficients for the tail polynomial (used for x outside [-1, +1]).
T01:    .double  0.9193672354545696501477531, -0.0052035944094405570566862
T00:    .double  0.3992772987220534996563340,  0.2521268658714555740707959
T11:    .double -0.5273186542866779494437308, -0.7201738803584184183894208
T10:    .double  0.1730466268612773143731748,  0.1408679409162453515360961


// Rename the general registers (just to make it easier to keep track of them).
#if defined __i386__
    #define r0  %eax
    #define r1  %ecx
    #define r2  %edx
    #define r3  %ebx
    #define r4  %esp
    #define r5  %ebp
    #define r6  %esi
    #define r7  %edi
#elif defined __x86_64__
    #define r0  %rax
    #define r1  %rcx
    #define r2  %rdx
    #define r3  %rbx
    #define r4  %rsp
    #define r5  %rbp
    #define r6  %rsi
    #define r7  %rdi
#else
    #error "Unknown architecture."
#endif


    .text


// Define various symbols.

#define BaseP       r0      // Base address for position-independent addressing.

#define p0          %xmm0   // Must be in %xmm0 for return on x86_64.
#define x           %xmm1
#define p1          %xmm2
#define x1          %xmm3
#define v0          %xmm4
#define t0          %xmm5

#if defined __i386__

    // Define location of argument x.
    #define Argx            4(%esp)

    // Define how to address data.  BaseP must contain the address of label 0.
    #define Address(label)  label-0b(BaseP)

#elif defined __x86_64__

    // Define location of argument x.
    #define Argx            %xmm0

    // Define how to address data.
    #define Address(label)  label(%rip)

#endif


/*  float atanf(float x).

    Notes:

        Citations in parentheses below indicate the source of a requirement.

        "C" stands for ISO/IEC 9899:TC2.

        The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no
        requirements since it defers to C and requires errno behavior only if
        we choose to support it by arranging for "math_errhandling &
        MATH_ERRNO" to be non-zero, which we do not.

    Return value:

        For arctangent of +/- zero, return zero with same sign (C F.9 12 and
        F.9.1.3).

        For arctangent of +/- infinity, return +/- pi/2.  (C F.9.1.3).

        For a NaN, return the same NaN (C F.9 11 and 13).  (If the NaN is a
        signalling NaN, we return the "same" NaN quieted.)

        Otherwise:

            If the rounding mode is round-to-nearest, return arctangent(x)
            faithfully rounded.

            Returns a value in [-pi/2, +pi/2] (C 7.12.4.3 3).  Note that this
            prohibits returning correctly rounded values for atanf of large
            positive or negative arguments, since pi/2 rounded to a float lies
            outside that interval.
        
            Not implemented:  In other rounding modes, return arctangent(x)
            possibly with slightly worse error, not necessarily honoring the
            rounding mode (Ali Sazegari narrowing C F.9 10).

    Exceptions:

        Raise underflow for a denormal result (C F.9 7 and Draft Standard for
        Floating-Point Arithmetic P754 Draft 1.2.5 9.5).  If the input is the
        smallest normal, underflow may or may not be raised.  This is stricter
        than the older 754 standard.

        May or may not raise inexact, even if the result is exact (C F.9 8).

        Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite
        of C 4.2.1)  but not if the input is a quiet NaN (C F.9 11).

        May not raise exceptions otherwise (C F.9 9).

    Properties:

        Monotonic.

        Exhaustive testing proved this routine returns faithfully rounded
        results.
*/
    .align  5
#if !defined DevelopmentInstrumentation
    // This is the regular name used in the deployed implementation.
        .globl _atanf
    _atanf:
#else
    // This is the name used for a special test version of the routine.
        .globl _atanfInstrumented
    _atanfInstrumented:
#endif

    cvtss2sd    Argx, x             // Convert x to double precision.

    #if defined __i386__

        // Get address of 0 in BaseP.
            call    0f              // Push program counter onto stack.
        0:
            pop     BaseP           // Get program counter.

    #endif

/*  We use different algorithms for different parts of the domain.  There
    is a "negative tail" from -infinity to nPoint0, a center from nPoint0 to
    pPoint0, and a "positive tail" from pPoint0 to +infinity.  Here, we compare
    and branch to the appropriate code.

    NaNs are weeded out in PositiveTail or NegativeTail.
*/

    ucomisd     Address(pPoint0), x
    ja          PositiveTail

    ucomisd     Address(nPoint0), x
    jb          NegativeTail


/*  Here we have nPoint0 <= x <= pPoint0.  This is handled with a simple
    evaluation of a polynomial that approximates arctangent.

    The polynomial has been arranged into the form:

        x * c2
            * ((x**4 + c01 * x**2 + c00))
            * ((x**4 + c11 * x**2 + c10))
            * ((x**4 + c21 * x**2 + c20))
            * ((x**4 + c31 * x**2 + c30))

    The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20
    at C00, c11 and c31 at C11, and c10 and c30 at C10.  c2 is at C2.

    The quartic factors are evaluated in SIMD registers.  For brevity, some
    comments below describe only one element of a register.  The other is
    analagous.
*/
    movlhps     x, x1               // Save a copy of x for later.
    mulsd       x, x                // Form x**2.
#define x2  x   // Define name describing current register contents.
    movlhps     x2, x2              // Duplicate x**2.
    movapd      Address(C11), p1
    addpd       x2, p1              // Form x**2 + c11.
    mulpd       x2, p1              // Form x**4 + c11 * x**2.
    addpd       Address(C10), p1    // Form x**4 + c11 * x**2 + c10.
    movapd      Address(C01), p0    // Get first coefficients.
    movlpd      Address(C2), x1     // Put c2 in one element, with x in other.
    addpd       x2, p0              // Form x**2 + c01.
    mulpd       x2, p0              // Form x**4 + c01 * x**2.
    addpd       Address(C00), p0    // Form x**4 + c01 * x**2 + c00.
    mulpd       p1, p0              // Combine factors.
    mulpd       x1, p0              // Multiply by x and c2.
    movhlps     p0, p1              // Get high element.
    mulsd       p1, p0              // Finish combining factors.
#undef x2

// Return the double-precision number currently in p0.
ReturnDoubleInp0:
    cvtsd2ss    p0, p0              // Convert result to single precision.

// Return the single-precision number currently in p0.
ReturnSingleInp0:

    #if defined __i386__
        movss       p0, Argx        // Shuttle result through memory.
            // This uses the argument area for scratch space, which is allowed.
        flds        Argx            // Return input on floating-point stack.
    #else
        // On x86_64, the return value is now in p0, which is %xmm0.
    #endif

    ret


// Handle pPoint0 < x.
PositiveTail:

    ucomisd     Address(pPoint1), x     // Compare x to algorithm change point.
    jae         InputIsPositiveSpecial

    movsd       Address(pHalfPi), p0    // Prepare +pi/2.

CommonTail:

/*  Here we have pPoint0 < x < pPoint1.  The algorithm here is inspired by the
    identity (for positive x) arctangent(x) = +pi/2 - arctangent(1/x).  This
    is approximated by evaluating:

        +pi/2 -
            * ((x**4 + t01 * x**2 + t00))
            * ((x**4 + t11 * x**2 + t10))
            * ((x**4 + t21 * x**2 + t20))
            * ((x**4 + t31 * x**2 + t30))
            * (1/x) * (1/x)**16.

    The quartic factors are evaluated in SIMD registers.  For brevity, some
    comments below describe only one element of a register.  The other is
    analagous.
*/

    movsd       Address(One), v0    // Get constant.
    divsd       x, v0               // Form 1/x, which we will call r.
        /*  I tried rcpss, but using it requires a precision conversion, two
            adds, and three multiplies.  divsd performance is not bad on Core2, 
            and I was not able to get the rcpss version to run as fast.
        */
    mulsd       x, x                // Form x**2.
#define x2  x   // Define name describing current register contents.
    movlhps     v0, v0              // Copy finished reciprocal to high element.
    movlhps     x2, x2              // Duplicate x**2.
    mulsd       v0, v0              // Form r**2.
    mulsd       v0, v0              // Form r**4.
    mulsd       v0, v0              // Form r**8.
    movapd      Address(T11), p1    // Get first coefficients.
    mulsd       v0, v0              // Form r**16.

    movapd      Address(T01), t0    // Get first coefficients.
    addpd       x2, p1              // Form x**2 + t11.
    mulpd       x2, p1              // Form x**4 + t11 * x**2.
    addpd       Address(T10), p1    // Form x**4 + t11 * x**2 + t10.
    addpd       x2, t0              // Form x**2 + t01.
    mulpd       x2, t0              // Form x**4 + t01 * x**2.
    addpd       Address(T00), t0    // Form x**4 + t01 * x**2 + t00.
    mulpd       p1, t0              // Combine factors.
    mulpd       v0, t0              // Multiply by r**16 and r.
    movhlps     t0, p1              // Get high element.
    mulsd       p1, t0              // Finish combining factors.
    subsd       t0, p0              // Subtract from pi/2.
#undef x2

    jmp         ReturnDoubleInp0


// Handle x < nPoint0.
NegativeTail:

    ucomisd     Address(nPoint1), x     // Compare x to algorithm change point.
    jbe         InputIsNegativeSpecial

    movsd       Address(nHalfPi), p0    // Prepare -pi/2.

    jmp         CommonTail


    .literal8
/*  Define value near +/- pi/2 that yield +/- pi/2 rounded toward zero when
    converted to single precision.  This allows us to generate inexact and
    return the desired values for atanf of big positive and negative numbers.
*/
AlmostpHalfPi:  .double +1.5707962
AlmostnHalfPi:  .double -1.5707962


    .text

/*  Here we handle inputs greater or equal to pPoint1, including infinity,
    but not including NaNs.
*/
InputIsPositiveSpecial:

    /*  The input is a big positive number.  Return +pi/2 rounded toward zero
        and generate an inexact exception.
    */
    cvtsd2ss    Address(AlmostpHalfPi), p0
    jmp         ReturnSingleInp0


/*  Here we handle inputs less than or equal to -1, including -infinity and
    NaNs.  The condition code must be set as if by using ucomisd to compare the
    input in the "source operand" to nPoint1 in the "destination operand".
*/
InputIsNegativeSpecial:

    jp          InputIsNaN

    /*  The input is a big negative number.  Return -pi/2 rounded toward zero
        and generate an inexact exception.
    */
    cvtsd2ss    Address(AlmostnHalfPi), p0
    jmp         ReturnSingleInp0


InputIsNaN:

    #if defined __i386__
        flds        Argx            // Return result on floating-point stack.
    #else
        cvtsd2ss    x, %xmm0        // Return in %xmm0.
            /*  We cannot just return the input, because we must quiet a
                signalling NaN.
            */
    #endif

    ret
