// file kernel/n/alpha/montgomery.S: Montgomery modular exponentiation
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                   Exponentiation modulaire de Montgomery              |
 |                                                                       |
 +-----------------------------------------------------------------------*/


                          # +----------------------+
                          # |  Division modulaire  |
                          # +----------------------+


   # void xn(mgdiv_n2)(chiffre *a, chiffre *c, chiffre d, long n)
   #
   # input:
   # a = natural number of length 2n+1
   # c = natural number of length n
   # d = -1/c mod BASE
   #
   # constraints:
   # n >= 2, a[0..2n-1] <= (BASE^n - 1)^2, a and c distinct
   #
   # output:
   # a[n..2n-1] <- a[0..2n-1]/BASE^n mod c, not normalized

#ifdef assembly_sn_mgdiv_n2
#define L(x) .Lsn_mgdiv_##x

        # sn_mgdiv_n2: Montgomery division of the natural a by the n-digit
        # modulus c. One digit of a is reduced up front when n is odd, then
        # the main loop reduces two digits of a per pass.
        #
        # On entry (as read by the code below):
        #   $16 = a, $17 = c, $18 = d, $19 = n, $26 = return address,
        #   $27 = base used by ldgp to set up $gp.
        # Result, per the interface comment preceding this block:
        #   a[n..2n-1] <- a[0..2n-1]/BASE^n mod c, not normalized.
        #
        # The multiply-accumulate and subtract work is done by entering the
        # external code sn_muladdloop, sn_muladdloop2 and sn_subloop at a
        # computed offset.
        # NOTE(review): the register contract of those three routines is
        # not visible in this file; remarks below about what they consume
        # and how they move the pointers are assumptions to be confirmed.
        .align 5
        .globl sn_mgdiv_n2
        .ent   sn_mgdiv_n2
sn_mgdiv_n2:
        .frame $30,0,$26,0
        .prologue 1
	ldgp   $gp,  0($27)

	# Register roles. The names are cpp macros and are removed again at
	# the end of the routine.
	#   a, c      running pointers into a and c, biased by -s bytes
	#   d         the constant d = -1/c mod BASE
	#   l         (1-n) with its low 5 bits cleared; seed for i, and
	#             (times 8) the rewind applied to a and c after a call
	#   n         count of digits still to reduce
	#   r         carry kept between passes of the main loop
	#   s         byte bias 8*((1-n) mod 32)
	#   u, v      scratch and carry words
	#   i         counter reloaded from l before each helper call
	#   x, y      scratch; x also addresses the next digit to reduce
	#   m0, m1    Montgomery multipliers for the current digit pair
	#   c0, c1    c[0] and c[1]
	#   mul1, mul2, sub   computed entry addresses into the helpers
	# Note that c is $16 and r is $17, i.e. the incoming a and c argument
	# registers. The setup code therefore uses the raw names $16 and $17
	# until their incoming values have been consumed.
	#define _a_    $20
	#define _c_    $16
	#define _d_    $18
	#define _l_    $21
	#define _n_    $19
	#define _r_    $17
	#define _s_    $22
	#define _u_    $0
	#define _v_    $1
	#define _i_    $2
	#define _x_    $5
	#define _y_    $6
	#define _m0_   $7
	#define _m1_   $8
	#define _c0_   $23
	#define _c1_   $24
	#define _mul1_ $4
	#define _mul2_ $25
	#define _sub_  $28

	# set up the entry into the unrolled inner loops
	ldq    _c0_, 0($17)      # c0 <- c[0]
	ldq    _c1_, 8($17)      # c1 <- c[1]
	lda    _x_,  8($16)      # x <- &a[1]
	subq   $31,  _n_,  _l_   # l <- -n
	addq   _l_,  1,    _l_   # l <- 1-n
	and    _l_,  31,   _u_   # u <- (1-n) mod 32
	bic    _l_,  31,   _l_   # l <- -32*ceil((n-1)/32)
	sll    _u_,   3,   _s_   # s  <- 8*((1-n) mod 32)
	subq   $16,  _s_,  _a_   # a <- &a[0] - s
	lda    _a_,  8(_a_)      # a <- &a[1] - s
	subq   $17,  _s_,  _c_   # c <- &c[0] - s (last use of incoming $17)
	lda    _c_,  8(_c_)      # c <- &c[1] - s
	lda    _sub_, sn_subloop       # base address of sn_subloop
	lda    _mul1_,sn_muladdloop    # base address of sn_muladdloop
	lda    _mul2_,sn_muladdloop2   # base address of sn_muladdloop2
	# The entry offsets below are 32, 44 and 76 bytes per unit of
	# (1-n) mod 32, that is 8, 11 and 19 four-byte instructions.
	# NOTE(review): presumably the per-digit code size of each unrolled
	# body - confirm against the helper sources.
	s4addq _u_,  _s_,  _v_    # v <- 4*u + s = 12*((1-n) mod 32)
	subq   _v_,  _u_,  _v_    # v <- 11*((1-n) mod 32)
	addq   _v_,  _s_,  _u_    # u <- 19*((1-n) mod 32)
	s4addq _s_,  _sub_, _sub_ # sub <- entry address into subloop (+4*s)
	s4addq _v_,  _mul1_,_mul1_# mul1 <- entry address into muladdloop (+4*v)
	s4addq _u_,  _mul2_,_mul2_# mul2 <- entry address into muladdloop2 (+4*u)
	
	# when n is odd, handle the first digit separately
	bis    $31,  $31,  _r_   # r <- 0 (no pending carry)
	blbc   _n_,  L(loop)     # low bit of n clear: n even, go to main loop
	ldq    _u_,  -8(_x_)     # u <- a[0]
	stq    $31,  -8(_x_)     # a[0] <- 0
	mulq   _u_,  _d_,  _m1_  # m1 <- a[0]*d mod BASE
	umulh  _m1_, _c0_, _x_   # x <- high word of m1*c[0]
	# With d = -1/c mod BASE the low word of m1*c[0] is -a[0] mod BASE,
	# so a[0] + m1*c[0] has a zero low word and carries out 1 exactly
	# when a[0] is nonzero.
	cmpult  $31, _u_,  _u_   # u <- (a[0] != 0)
	addq   _x_,  _u_,  _u_   # u <- (m1*c[0]+a[0])/BASE
	bis    _l_,  _l_,  _i_   # init counter
	jsr    $27,  (_mul1_)    # a <- a + m*c; return address left in $27
	ldq    _x_,  0(_a_)      # propagate the carry over two digits
	addq   _u_,  _x_,  _u_   # u <- carry + word at 0(a)
	stq    _u_,  0(_a_)
	cmpult _u_,  _x_,  _u_   # u <- carry out of that addition
	ldq    _x_,  8(_a_)
	addq   _u_,  _x_,  _v_   # v <- carry + word at 8(a)
	cmpult _v_,  _x_,  _r_   # r <- carry out, kept for the main loop
	stq    _v_,  8(_a_)
	lda    _a_,  8(_a_)      # advance a by one digit
	# n is odd here; after subtracting 2 it stays odd and the bgt test of
	# the main loop then runs (n-1)/2 passes of two digits each.
	lda    _n_,  -2(_n_)
	# l is a non-positive multiple of 32, so these step both pointers back.
	# NOTE(review): presumably this undoes the advance made inside the
	# helper loop - confirm.
	s8addq _l_,  _a_,  _a_
	s8addq _l_,  _c_,  _c_
	addq   _a_,  _s_,  _x_  # x <- a + s; -8(x) is the next digit to reduce
	
	# main loop over the remaining digits of a, two per pass
	.align 5
L(loop):
	ldq    _u_,  -8(_x_)     # u:v <- a[i]:a[i+1]
	ldq    _v_,   0(_x_)
	stq    $31,  -8(_x_)     # a[i] <- 0
	mulq   _u_,  _d_,  _m0_  # m0 <- a[i]*d mod BASE
	umulh  _m0_, _c0_, _x_   # x <- high word of m0*c[0]
	cmpult $31,  _u_,  _u_   # u <- (a[i] != 0), carry of a[i] + low(m0*c[0])
	addq   _x_,  _u_,  _u_   # u <- high word of (a[i] + m0*c[0])
	mulq   _m0_, _c1_, _x_   # x <- low word of m0*c[1]
	addq   _u_,  _v_,  _y_   # y <- u + a[i+1] mod BASE
	addq   _y_,  _x_,  _x_   # x <- a[i+1] + u + m0*c[1] mod BASE
	mulq   _x_,  _d_,  _m1_  # m1 <- x*d mod BASE
	mulq   _m1_, _c0_, _x_   # u:v <- u + m1*c[0] (= carry)
	umulh  _m1_, _c0_, _v_   # v <- high word of m1*c[0]
	addq   _u_,  _x_,  _u_   # u <- u + low word of m1*c[0]
	cmpult _u_,  _x_,  _x_   # x <- carry out of that addition
	addq   _x_,  _v_,  _v_   # v <- high word plus carry
	bis    _l_,  _l_,  _i_   # init counter
	jsr    $27,  (_mul2_)    # a <- a + BASE^i*m*c; return address in $27
	ldq    _x_,  0(_a_)      # add the carries to a[i+n-1]:a[i+n]
	ldq    _y_,  8(_a_)
	addq   _r_,  _u_,  _u_   # u <- previous carry r + u
	cmpult _u_,  _r_,  _r_   # r <- carry out
	addq   _x_,  _u_,  _u_   # u <- u + a[i+n-1]
	cmpult _u_,  _x_,  _x_   # x <- carry out
	addq   _x_,  _r_,  _r_   # r <- total carry into the next word
	addq   _r_,  _v_,  _v_   # v <- v + that carry
	cmpult _v_,  _r_,  _r_   # r <- carry out
	addq   _y_,  _v_,  _v_   # v <- v + a[i+n]
	cmpult _v_,  _y_,  _y_   # y <- carry out
	addq   _y_,  _r_,  _r_  # r <- new carry
	stq    _u_,  0(_a_)     # new a[i+n-1]
	stq    _v_,  8(_a_)     # new a[i+n]
	lda    _a_,  16(_a_)    # advance the pointers
	lda    _n_,  -2(_n_)    # two more digits done
	s8addq _l_,  _a_,  _a_  # a <- a + 8*l (same rewind as in the odd step)
	s8addq _l_,  _c_,  _c_  # c <- c + 8*l
	addq   _a_,  _s_,  _x_  # x <- &a[i+1]
	bgt    _n_,  L(loop)    # repeat while digits remain
	
	# fold the final carry back in
	bne    _r_,  L(ret)     # pending carry: go subtract c
	ret    $31,  ($26),1    # no carry: plain return to the caller
L(ret):
	# Subtract c[0] from the digit at -8(x) here, then jump into
	# sn_subloop at its computed entry point.
	# NOTE(review): presumably sn_subloop subtracts the remaining digits
	# of c using the pointers placed in $16, $18 and $20 - confirm.
	bis    _c_,  _c_,  _d_   # $18 <- current c pointer
	bis    _a_,  _a_,  _c_   # $16 <- current a pointer
	ldq    _u_,  -8(_x_)     # u <- digit at -8(x)
	subq   _u_,  _c0_, _v_   # v <- u - c[0] mod BASE
	cmpult _u_,  _v_,  _u_   # u <- borrow of that subtraction
	stq    _v_,  -8(_x_)     # store the reduced digit
	bis    _l_,  _l_,  _i_   # init counter
	bis    $26,  $26,  $27	# tail call: $27 <- return address of the caller
	jmp    $31,  (_sub_)     # enter sn_subloop without linking
	
	#undef _a_
	#undef _c_
	#undef _d_
	#undef _l_
	#undef _n_
	#undef _r_
	#undef _s_
	#undef _u_
	#undef _v_
	#undef _i_
	#undef _x_
	#undef _y_
	#undef _m0_
	#undef _m1_
	#undef _c0_
	#undef _c1_
	#undef _mul1_
	#undef _mul2_
	#undef _sub_

        .end   sn_mgdiv_n2
#undef L
#endif /* assembly_sn_mgdiv_n2 */

