sysdeps/i386/fpu/s_cbrtl.S

   1 /* Compute cubic root of long double value.
   2    Copyright (C) 1997-2022 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <libm-alias-ldouble.h>
  20 #include <machine/asm.h>
  21
  22         .section .rodata
  23
  24         .align ALIGNARG(4)
  25         .type f8,@object
  26 f8:     .tfloat 0.161617097923756032
  27         ASM_SIZE_DIRECTIVE(f8)
  28         .align ALIGNARG(4)
  29         .type f7,@object
  30 f7:     .tfloat -0.988553671195413709
  31         ASM_SIZE_DIRECTIVE(f7)
  32         .align ALIGNARG(4)
  33         .type f6,@object
  34 f6:     .tfloat 2.65298938441952296
  35         ASM_SIZE_DIRECTIVE(f6)
  36         .align ALIGNARG(4)
  37         .type f5,@object
  38 f5:     .tfloat -4.11151425200350531
  39         ASM_SIZE_DIRECTIVE(f5)
  40         .align ALIGNARG(4)
  41         .type f4,@object
  42 f4:     .tfloat 4.09559907378707839
  43         ASM_SIZE_DIRECTIVE(f4)
  44         .align ALIGNARG(4)
  45         .type f3,@object
  46 f3:     .tfloat -2.82414939754975962
  47         ASM_SIZE_DIRECTIVE(f3)
  48         .align ALIGNARG(4)
  49         .type f2,@object
  50 f2:     .tfloat 1.67595307700780102
  51         ASM_SIZE_DIRECTIVE(f2)
  52         .align ALIGNARG(4)
  53         .type f1,@object
  54 f1:     .tfloat 0.338058687610520237
  55         ASM_SIZE_DIRECTIVE(f1)
  56
  57 #define CBRT2           1.2599210498948731648
  58 #define ONE_CBRT2       0.793700525984099737355196796584
  59 #define SQR_CBRT2       1.5874010519681994748
  60 #define ONE_SQR_CBRT2   0.629960524947436582364439673883
  61
  62         /* We make the entries in the following table all 16 bytes
  63            wide to avoid having to implement a multiplication by 10.  */
  64         .type factor,@object
  65         .align ALIGNARG(4)
  66 factor: .tfloat ONE_SQR_CBRT2
  67         .byte 0, 0, 0, 0, 0, 0
  68         .tfloat ONE_CBRT2
  69         .byte 0, 0, 0, 0, 0, 0
  70         .tfloat 1.0
  71         .byte 0, 0, 0, 0, 0, 0
  72         .tfloat CBRT2
  73         .byte 0, 0, 0, 0, 0, 0
  74         .tfloat SQR_CBRT2
  75         ASM_SIZE_DIRECTIVE(factor)
  76
  77         .type two64,@object
  78         .align ALIGNARG(4)
  79 two64:  .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
  80         ASM_SIZE_DIRECTIVE(two64)
  81
  82 #ifdef PIC
  83 #define MO(op) op##@GOTOFF(%ebx)
  84 #define MOX(op,x) op##@GOTOFF(%ebx,x,1)
  85 #else
  86 #define MO(op) op
  87 #define MOX(op,x) op(x)
  88 #endif
  89
  90         .text
  91 ENTRY(__cbrtl)
  92         movl    4(%esp), %ecx
  93         movl    12(%esp), %eax
  94         orl     8(%esp), %ecx
  95         movl    %eax, %edx
  96         andl    $0x7fff, %eax
  97         orl     %eax, %ecx
  98         jz      1f
  99         xorl    %ecx, %ecx
 100         cmpl    $0x7fff, %eax
 101         je      1f
 102
 103 #ifdef PIC
 104         pushl   %ebx
 105         cfi_adjust_cfa_offset (4)
 106         cfi_rel_offset (ebx, 0)
 107         LOAD_PIC_REG (bx)
 108 #endif
 109
 110         cmpl    $0, %eax
 111         jne     2f
 112
 113 #ifdef PIC
 114         fldt    8(%esp)
 115 #else
 116         fldt    4(%esp)
 117 #endif
 118         fmull   MO(two64)
 119         movl    $-64, %ecx
 120 #ifdef PIC
 121         fstpt   8(%esp)
 122         movl    16(%esp), %eax
 123 #else
 124         fstpt   4(%esp)
 125         movl    12(%esp), %eax
 126 #endif
 127         movl    %eax, %edx
 128         andl    $0x7fff, %eax
 129
 130 2:      andl    $0x8000, %edx
 131         subl    $16382, %eax
 132         orl     $0x3ffe, %edx
 133         addl    %eax, %ecx
 134 #ifdef PIC
 135         movl    %edx, 16(%esp)
 136
 137         fldt    8(%esp)                 /* xm */
 138 #else
 139         movl    %edx, 12(%esp)
 140
 141         fldt    4(%esp)                 /* xm */
 142 #endif
 143         fabs
 144
 145         /* The following code has two tracks:
 146             a) compute the normalized cbrt value
 147             b) compute xe/3 and xe%3
 148            The right track computes the value for b) and this is done
 149            in an optimized way by avoiding division.
 150
 151            But why two tracks at all?  Very easy: efficiency.  Some FP
 152            instruction can overlap with a certain amount of integer (and
 153            FP) instructions.  So we get (except for the imull) all
 154            instructions for free.  */
 155
 156         fldt    MO(f8)                  /* f8 : xm */
 157         fmul    %st(1)                  /* f8*xm : xm */
 158
 159         fldt    MO(f7)
 160         faddp                           /* f7+f8*xm : xm */
 161         fmul    %st(1)                  /* (f7+f8*xm)*xm : xm */
 162                         movl    $1431655766, %eax
 163         fldt    MO(f6)
 164         faddp                           /* f6+(f7+f8*xm)*xm : xm */
 165                         imull   %ecx
 166         fmul    %st(1)                  /* (f6+(f7+f8*xm)*xm)*xm : xm */
 167                         movl    %ecx, %eax
 168         fldt    MO(f5)
 169         faddp                           /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */
 170                         sarl    $31, %eax
 171         fmul    %st(1)                  /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 172                         subl    %eax, %edx
 173         fldt    MO(f4)
 174         faddp                           /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */
 175         fmul    %st(1)                  /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 176         fldt    MO(f3)
 177         faddp                           /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */
 178         fmul    %st(1)                  /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 179         fldt    MO(f2)
 180         faddp                           /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */
 181         fmul    %st(1)                  /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 182         fldt    MO(f1)
 183         faddp                           /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */
 184
 185         fld     %st                     /* u : u : xm */
 186         fmul    %st(1)                  /* u*u : u : xm */
 187         fld     %st(2)                  /* xm : u*u : u : xm */
 188         fadd    %st                     /* 2*xm : u*u : u : xm */
 189         fxch    %st(1)                  /* u*u : 2*xm : u : xm */
 190         fmul    %st(2)                  /* t2:=u*u*u : 2*xm : u : xm */
 191                         movl    %edx, %eax
 192         fadd    %st, %st(1)             /* t2 : t2+2*xm : u : xm */
 193                         leal    (%edx,%edx,2),%edx
 194         fadd    %st(0)                  /* 2*t2 : t2+2*xm : u : xm */
 195                         subl    %edx, %ecx
 196         faddp   %st, %st(3)             /* t2+2*xm : u : 2*t2+xm */
 197                         shll    $4, %ecx
 198         fmulp                           /* u*(t2+2*xm) : 2*t2+xm */
 199         fdivp   %st, %st(1)             /* u*(t2+2*xm)/(2*t2+xm) */
 200         fldt    MOX(32+factor,%ecx)
 201         fmulp                           /* u*(t2+2*xm)/(2*t2+xm)*FACT */
 202         pushl   %eax
 203         cfi_adjust_cfa_offset (4)
 204         fildl   (%esp)                  /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */
 205         fxch                            /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */
 206         fscale                          /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */
 207         popl    %edx
 208         cfi_adjust_cfa_offset (-4)
 209 #ifdef PIC
 210         movl    16(%esp), %eax
 211         popl    %ebx
 212         cfi_adjust_cfa_offset (-4)
 213         cfi_restore (ebx)
 214 #else
 215         movl    12(%esp), %eax
 216 #endif
 217         testl   $0x8000, %eax
 218         fstp    %st(1)
 219         jz      4f
 220         fchs
 221 4:      ret
 222
 223         /* Return the argument.  */
 224 1:      fldt    4(%esp)
 225         fadd    %st
 226         ret
 227 END(__cbrtl)
 228 libm_alias_ldouble (__cbrt, cbrt)