// file kernel/n/ppc32/sqrt_n2.S: O(n^2) square root of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                          Racine carre quadratique                    |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                             ; +-----------------+
                             ; |  Racine carre  |
                             ; +-----------------+

; void xn(sqrt_n2)(chiffre *a, long la, chiffre *b)
;
; entre :
; a = naturel longueur la
; b = naturel de longueur la/2
;
; contraintes :
; la > 0, la pair, BASE/16 <= a[la-1] < BASE/4
; a,b non confondus
;
; sortie :
; b <- 2*floor(sqrt(a))
; a <- a - b^2/4

#ifdef assembly_sn_sqrt_n2
#define L(x) .Lsn_sqrt_n2_##x
#ifdef debug_sqrt_n2
.globl _sn_sqrt_n2_buggy
_sn_sqrt_n2_buggy:
#else
.globl _sn_sqrt_n2
_sn_sqrt_n2:
#endif

	; variables locales
	#define _a_    r12
	#define _la_   r8
	#define _b_    r11
	#define _b0_   r31
	#define _bi_   r30
	#define _bh_   r29
	#define _ah_   r2
	#define _msub_ r6
	#define _add_  r7
	#define _u_    r6
	#define _v_    r7
	#define _x_    r3
	#define _y_    r4
	#define _z_    r5
	#define _q_    r9

	mflr   r0
	stmw   r29,  4(r1)

	add    _b_,  r5,   r4	; b <- &b[la/2]
	add    _b_,  _b_,  r4
	subi   _b0_, _b_,  8	; b0 <- &b[la/2-2]
	subi   _la_, r4,   2	; la -= 2
	slwi   r2,   _la_, 2
	add    _a_,  r3,   r2 	; a <- &a[la-2]

	; calcule la racine carre des deux chiffres de tte
	lwz    _x_,  0(_a_)	; y:x <- a[1]:a[0]
	lwz    _y_,  4(_a_)
	lis    _bh_, 0x8000	; bh <- BASE/2
	subis  _y_,  _y_, 0x1000; y:x -= (bh/2)^2
	lis    _z_,  0x2000	; z <- BASE/8 (bit de test)
	slwi   _y_,  _y_,  2	; dcale le reste de 2 bits
	inslwi _y_,  _x_,  2,30
	slwi   _x_,  _x_,  2
1:
	add    _q_,  _z_,  _bh_ ; bit de test <- 1
	srwi   _u_,  _y_,  31	; dcale le reste d un bit
	addc   _x_,  _x_,  _x_
	adde   _y_,  _y_,  _y_
	subfc  _v_,  _q_,  _y_
	addme. _u_,  _u_	; a passe ?
	blt    2f
	add    _bh_, _z_,  _q_	; si oui, valide le bit
	mr     _y_,  _v_	; mise  jour du reste
2:
	srwi.  _z_,  _z_,  1
	bne    1b

	stw    _y_,  0(_a_)	; sauve le reste dans a[0]:a[1]
	stw    _z_,  4(_a_)
	stw    _bh_,-4(_b_)	; sauve le chiffre de tte de b
	and.   _la_, _la_, _la_
	beq    L(done)

	; prpare le droulement des boucles internes
	bcl    20,31, L(here)   ; adresses de saut pour 2 chiffres
L(here):
	mflr   r2
	addi   _add_,  r2,     lo16(Lsn_addloop    - L(here)  + 4*30*4)
/*	addis  _add_,  _add_,  ha16(Lsn_addloop    - L(here)  + 4*30*4) */
	addi   _msub_, r2,     lo16(Lsn_mulsubloop - L(here)  + 4*30*10)
/*	addis  _msub_, _msub_, ha16(Lsn_mulsubloop - L(here)  + 4*30*10) */
        INVERSE(_bh_,_bi_,_y_,_z_)

	; calcule les chiffres suivants par divisions
L(loop):
	; quotient approch, peut tre trop grand d une ou deux units
	lwz    _x_,  0(_a_)
	lwz    _y_, -4(_a_)
	mr     _ah_, _x_
        DIV(_y_,_x_,_bh_,_bi_,_q_,_z_)
	stw    _q_, 0(_b0_)	; b <- b + q

	; a <- a - v*q - q^2
	mtlr   _msub_
	subf   _x_, _b0_, _b_
	addi   _x_, _x_,  124
	srwi   _x_, _x_,  7
	mtctr  _x_		; ctr <- ceil(lb/32)
	slwi   _x_, _x_,  7
	subf   _a_, _x_,  _a_	; cadre a et b sur le multiple de 32 prc.
	subf   _b_, _x_,  _b_
	li     _x_, 0
	blrl
	lwz    _z_,  4(_b0_) 	; b <- b + 2q
	addc   _q_,  _q_, _q_
	addze  _z_,  _z_
	stw    _q_,  0(_b0_)
	stw    _z_,  4(_b0_)
	subf.  _ah_, _x_, _ah_	; dernire retenue de la soustraction

	; corrige le quotient et le reste si < 0
	beq    L(q_ok)
L(corr):
	mtlr   _add_
	subic  _q_, _q_,  2	; 2q <- 2q-2, CA <- 1
	addme  _z_, _z_
	stw    _q_,  0(_b0_)
	stw    _z_,  4(_b0_)
	subf   _x_, _b0_, _b_
	addi   _x_, _x_,  124
	srwi   _x_, _x_,  7
	mtctr  _x_		; ctr <- ceil(lb/32)
	slwi   _x_, _x_,  7
	subf   _a_, _x_,  _a_	; cadre a et b sur le multiple de 32 prc.
	subf   _b_, _x_,  _b_
	mr     r10, _a_
	blrl
	addze. _ah_, _ah_	; dernire retenue
	bne    L(corr)

	# chiffre suivant
L(q_ok):
	subi   _msub_,_msub_,40 ; recule les adresses de saut
	subi   _add_, _add_, 16
	subf   _x_,   _b0_,  _b_
	andi.  _x_,   _x_, 0x7f	; si on franchit un multiple de 32
	bne    1f		; repart en fin de boucle
	addi   _msub_,_msub_,4*32*10
	addi   _add_, _add_, 4*32*4
1:
	subic. _la_, _la_, 2	; la -= 2
	subi   _b0_, _b0_, 4	; lb++
	subi   _a_,  _a_,  4	; a--
	bne    L(loop)
	
	# termin
L(done):
	mtlr   r0
	lmw    r29, 4(r1)
	blr

	#undef _a_
	#undef _la_
	#undef _b_
	#undef _b0_
	#undef _bi_
	#undef _bh_
	#undef _ah_
	#undef _msub_
	#undef _add_
	#undef _u_
	#undef _v_
	#undef _x_
	#undef _y_
	#undef _z_
	#undef _q_
	
#undef L
#endif /* assembly_sn_sqrt_n2 */
