// Copyright (c) 2020-2024 The Decred developers
// Use of this source code is governed by an ISC
// license that can be found in the LICENSE file.

package secp256k1

import (
	"encoding/hex"
	"math/big"
)

// References:
//   [SECG]: Recommended Elliptic Curve Domain Parameters
//     https://www.secg.org/sec2-v2.pdf
//
//   [HAC]: Handbook of Applied Cryptography, Menezes, van Oorschot, Vanstone.
//     http://cacr.uwaterloo.ca/hac/

// Many elliptic curve operations require working with scalars in a finite field
// characterized by the order of the group underlying the secp256k1 curve.
// Given this precision is larger than the biggest available native type,
// obviously some form of bignum math is needed.  This code implements
// specialized fixed-precision field arithmetic rather than relying on an
// arbitrary-precision arithmetic package such as math/big for dealing with the
// math modulo the group order since the size is known.  As a result, rather
// large performance gains are achieved by taking advantage of many
// optimizations not available to arbitrary-precision arithmetic and generic
// modular arithmetic algorithms.
//
// There are various ways to internally represent each element.  For example,
// the most obvious representation would be to use an array of 4 uint64s (64
// bits * 4 = 256 bits).  However, that representation suffers from the fact
// that there is no native Go type large enough to handle the intermediate
// results while adding or multiplying two 64-bit numbers.
//
// Given the above, this implementation represents the field elements as 8
// uint32s with each word (array entry) treated as base 2^32.  This was chosen
// because most systems at the current time are 64-bit (or at least have 64-bit
// registers available for specialized purposes such as MMX) so the intermediate
// results can typically be done using a native register (and using uint64s to
// avoid the need for additional half-word arithmetic).
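//
// As a quick aside on why 64-bit intermediates suffice for base 2^32 words
// (an illustrative sketch, not code from this package): the product of two
// words plus two carry words is at most
// (2^32-1)*(2^32-1) + 2*(2^32-1) = 2^64-1, which exactly fits a uint64.
//
//	var w1, w2, c1, c2 uint32 = 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
//	sum := uint64(w1)*uint64(w2) + uint64(c1) + uint64(c2) // = 2^64-1, no overflow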

const (
	// These fields provide convenient access to each of the words of the
	// secp256k1 curve group order N to improve code readability.
	//
	// The group order of the curve per [SECG] is:
	// 0xffffffff ffffffff ffffffff fffffffe baaedce6 af48a03b bfd25e8c d0364141
	//
	// nolint: dupword
	orderWordZero  uint32 = 0xd0364141
	orderWordOne   uint32 = 0xbfd25e8c
	orderWordTwo   uint32 = 0xaf48a03b
	orderWordThree uint32 = 0xbaaedce6
	orderWordFour  uint32 = 0xfffffffe
	orderWordFive  uint32 = 0xffffffff
	orderWordSix   uint32 = 0xffffffff
	orderWordSeven uint32 = 0xffffffff

	// These fields provide convenient access to each of the words of the two's
	// complement of the secp256k1 curve group order N to improve code
	// readability.
	//
	// The two's complement of the group order is:
	// 0x00000000 00000000 00000000 00000001 45512319 50b75fc4 402da173 2fc9bebf
	orderComplementWordZero  uint32 = (^orderWordZero) + 1
	orderComplementWordOne   uint32 = ^orderWordOne
	orderComplementWordTwo   uint32 = ^orderWordTwo
	orderComplementWordThree uint32 = ^orderWordThree
	// orderComplementWordFour  uint32 = ^orderWordFour  // unused
	// orderComplementWordFive  uint32 = ^orderWordFive  // unused
	// orderComplementWordSix   uint32 = ^orderWordSix   // unused
	// orderComplementWordSeven uint32 = ^orderWordSeven // unused

	// These fields provide convenient access to each of the words of the
	// secp256k1 curve group order N / 2 to improve code readability and avoid
	// the need to recalculate them.
	//
	// The half order of the secp256k1 curve group is:
	// 0x7fffffff ffffffff ffffffff ffffffff 5d576e73 57a4501d dfe92f46 681b20a0
	//
	// nolint: dupword
	halfOrderWordZero  uint32 = 0x681b20a0
	halfOrderWordOne   uint32 = 0xdfe92f46
	halfOrderWordTwo   uint32 = 0x57a4501d
	halfOrderWordThree uint32 = 0x5d576e73
	halfOrderWordFour  uint32 = 0xffffffff
	halfOrderWordFive  uint32 = 0xffffffff
	halfOrderWordSix   uint32 = 0xffffffff
	halfOrderWordSeven uint32 = 0x7fffffff

	// uint32Mask is simply a mask with all bits set for a uint32 and is used to
	// improve the readability of the code.
	uint32Mask = 0xffffffff
)

var (
	// zero32 is an array of 32 bytes used for the purposes of zeroing and is
	// defined here to avoid extra allocations.
	zero32 = [32]byte{}
)

// ModNScalar implements optimized 256-bit constant-time fixed-precision
// arithmetic over the secp256k1 group order. This means all arithmetic is
// performed modulo:
//
//	0xfffffffffffffffffffffffffffffffebaaedce6af48a03bbfd25e8cd0364141
//
// It only implements the arithmetic needed for elliptic curve operations,
// however, the operations that are not implemented can typically be worked
// around if absolutely needed.  For example, subtraction can be performed by
// adding the negation.
//
// Should it be absolutely necessary, conversion to the standard library
// math/big.Int can be accomplished by using the Bytes method, slicing the
// resulting fixed-size array, and feeding it to big.Int.SetBytes.  However,
// that should typically be avoided when possible as conversion to big.Ints
// requires allocations, is not constant time, and is slower when working modulo
// the group order.
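//
// For example, a conversion along those lines might be sketched as follows
// (an illustrative fragment only, where s is an existing ModNScalar):
//
//	sBytes := s.Bytes()
//	bigIntVal := new(big.Int).SetBytes(sBytes[:])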
type ModNScalar struct {
	// The scalar is represented as 8 32-bit integers in base 2^32.
	//
	// The following depicts the internal representation:
	// 	 ---------------------------------------------------------
	// 	|       n[7]     |      n[6]      | ... |      n[0]      |
	// 	| 32 bits        | 32 bits        | ... | 32 bits        |
	// 	| Mult: 2^(32*7) | Mult: 2^(32*6) | ... | Mult: 2^(32*0) |
	// 	 ---------------------------------------------------------
	//
	// For example, consider the number 2^87 + 2^42 + 1.  It would be
	// represented as:
	// 	n[0] = 1
	// 	n[1] = 2^10
	// 	n[2] = 2^23
	// 	n[3..7] = 0
	//
	// The full 256-bit value is then calculated by looping i from 7..0 and
	// doing sum(n[i] * 2^(32i)) like so:
	// 	n[7] * 2^(32*7) = 0    * 2^224 = 0
	// 	n[6] * 2^(32*6) = 0    * 2^192 = 0
	// 	...
	// 	n[2] * 2^(32*2) = 2^23 * 2^64  = 2^87
	// 	n[1] * 2^(32*1) = 2^10 * 2^32  = 2^42
	// 	n[0] * 2^(32*0) = 1    * 2^0   = 1
	// 	Sum: 0 + 0 + ... + 2^87 + 2^42 + 1 = 2^87 + 2^42 + 1
	n [8]uint32
}

// String returns the scalar as a human-readable hex string.
//
// This is NOT constant time.
func (s ModNScalar) String() string {
	b := s.Bytes()
	return hex.EncodeToString(b[:])
}

// Set sets the scalar equal to a copy of the passed one in constant time.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s := new(ModNScalar).Set(s2).AddInt(1) so that s = s2 + 1 where s2 is not
// modified.
func (s *ModNScalar) Set(val *ModNScalar) *ModNScalar {
	*s = *val
	return s
}

// Zero sets the scalar to zero in constant time.  A newly created scalar is
// already set to zero.  This function can be useful to clear an existing scalar
// for reuse.
func (s *ModNScalar) Zero() {
	s.n[0] = 0
	s.n[1] = 0
	s.n[2] = 0
	s.n[3] = 0
	s.n[4] = 0
	s.n[5] = 0
	s.n[6] = 0
	s.n[7] = 0
}

// IsZeroBit returns 1 when the scalar is equal to zero or 0 otherwise in
// constant time.
//
// Note that a bool is not used here because it is not possible in Go to convert
// from a bool to numeric value in constant time and many constant-time
// operations require a numeric value.  See IsZero for the version that returns
// a bool.
func (s *ModNScalar) IsZeroBit() uint32 {
	// The scalar can only be zero if no bits are set in any of the words.
	bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
	return constantTimeEq(bits, 0)
}

// IsZero returns whether or not the scalar is equal to zero in constant time.
func (s *ModNScalar) IsZero() bool {
	// The scalar can only be zero if no bits are set in any of the words.
	bits := s.n[0] | s.n[1] | s.n[2] | s.n[3] | s.n[4] | s.n[5] | s.n[6] | s.n[7]
	return bits == 0
}

// SetInt sets the scalar to the passed integer in constant time.  This is a
// convenience function since it is fairly common to perform some arithmetic
// with small native integers.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s := new(ModNScalar).SetInt(2).Mul(s2) so that s = 2 * s2.
func (s *ModNScalar) SetInt(ui uint32) *ModNScalar {
	s.Zero()
	s.n[0] = ui
	return s
}

// constantTimeEq returns 1 if a == b or 0 otherwise in constant time.
func constantTimeEq(a, b uint32) uint32 {
	return uint32((uint64(a^b) - 1) >> 63)
}

// constantTimeNotEq returns 1 if a != b or 0 otherwise in constant time.
func constantTimeNotEq(a, b uint32) uint32 {
	return ^uint32((uint64(a^b)-1)>>63) & 1
}

// constantTimeLess returns 1 if a < b or 0 otherwise in constant time.
func constantTimeLess(a, b uint32) uint32 {
	return uint32((uint64(a) - uint64(b)) >> 63)
}

// constantTimeLessOrEq returns 1 if a <= b or 0 otherwise in constant time.
func constantTimeLessOrEq(a, b uint32) uint32 {
	return uint32((uint64(a) - uint64(b) - 1) >> 63)
}

// constantTimeGreater returns 1 if a > b or 0 otherwise in constant time.
func constantTimeGreater(a, b uint32) uint32 {
	return constantTimeLess(b, a)
}

// constantTimeGreaterOrEq returns 1 if a >= b or 0 otherwise in constant time.
func constantTimeGreaterOrEq(a, b uint32) uint32 {
	return constantTimeLessOrEq(b, a)
}

// constantTimeMin returns min(a,b) in constant time.
func constantTimeMin(a, b uint32) uint32 {
	return b ^ ((a ^ b) & -constantTimeLess(a, b))
}
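
// The constant-time helpers above rely on two branch-free tricks.  First, a
// uint64 subtraction that underflows exactly in the true case sets bit 63, so
// shifting right by 63 yields a 0/1 result: a^b is 0 only when a == b and
// 0 - 1 underflows, while a - b underflows only when a < b.  Second, negating
// a 0/1 value produces an all-0s or all-1s mask that selects between two
// values without branching.  A hedged sketch combining both (illustrative
// only, not part of the package API):
//
//	ltMask := -constantTimeLess(a, b) // all 1s when a < b, all 0s otherwise
//	minVal := b ^ ((a ^ b) & ltMask)  // a when a < b, b otherwise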

// overflows determines if the current scalar is greater than or equal to the
// group order in constant time and returns 1 if it is or 0 otherwise.
func (s *ModNScalar) overflows() uint32 {
	// The intuition here is that the scalar is greater than the group order if
	// one of the higher individual words is greater than the corresponding
	// word of the group order and all higher words in the scalar are equal to
	// their corresponding word of the group order.  Since this type is modulo
	// the group order, being equal is also an overflow back to 0.
	//
	// Note that the words 5, 6, and 7 are all the max uint32 value, so there is
	// no need to test if those individual words of the scalar exceed them,
	// hence, only equality is checked for them.
	highWordsEqual := constantTimeEq(s.n[7], orderWordSeven)
	highWordsEqual &= constantTimeEq(s.n[6], orderWordSix)
	highWordsEqual &= constantTimeEq(s.n[5], orderWordFive)
	overflow := highWordsEqual & constantTimeGreater(s.n[4], orderWordFour)
	highWordsEqual &= constantTimeEq(s.n[4], orderWordFour)
	overflow |= highWordsEqual & constantTimeGreater(s.n[3], orderWordThree)
	highWordsEqual &= constantTimeEq(s.n[3], orderWordThree)
	overflow |= highWordsEqual & constantTimeGreater(s.n[2], orderWordTwo)
	highWordsEqual &= constantTimeEq(s.n[2], orderWordTwo)
	overflow |= highWordsEqual & constantTimeGreater(s.n[1], orderWordOne)
	highWordsEqual &= constantTimeEq(s.n[1], orderWordOne)
	overflow |= highWordsEqual & constantTimeGreaterOrEq(s.n[0], orderWordZero)

	return overflow
}

// reduce256 reduces the current scalar modulo the group order in accordance
// with the overflows parameter in constant time.  The overflows parameter
// specifies whether or not the scalar is known to be greater than the group
// order and MUST either be 1 in the case it is or 0 in the case it is not for a
// correct result.
func (s *ModNScalar) reduce256(overflows uint32) {
	// Notice that since s < 2^256 < 2N (where N is the group order), the max
	// possible number of reductions required is one.  Therefore, in the case a
	// reduction is needed, it can be performed with a single subtraction of N.
	// Also, recall that subtraction is equivalent to addition by the two's
	// complement while ignoring the carry.
	//
	// When s >= N, the overflows parameter will be 1.  Conversely, it will be 0
	// when s < N.  Thus multiplying by the overflows parameter will either
	// result in 0 or the multiplicand itself.
	//
	// Combining the above along with the fact that s + 0 = s, the following is
	// a constant time implementation that works by either adding 0 or the two's
	// complement of N as needed.
	//
	// The final result will be in the range 0 <= s < N as expected.
	overflows64 := uint64(overflows)
	c := uint64(s.n[0]) + overflows64*uint64(orderComplementWordZero)
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[1]) + overflows64*uint64(orderComplementWordOne)
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[2]) + overflows64*uint64(orderComplementWordTwo)
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[3]) + overflows64*uint64(orderComplementWordThree)
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[4]) + overflows64 // * 1
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[5]) // + overflows64 * 0
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[6]) // + overflows64 * 0
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(s.n[7]) // + overflows64 * 0
	s.n[7] = uint32(c & uint32Mask)
}
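
// As a hedged aside on the identity reduce256 relies on (illustrative math
// with made-up big.Int variables, not package code): working modulo 2^256,
// adding the two's complement of N is the same as subtracting N, since
// s + (2^256 - N) ≡ s - N (mod 2^256).  In big.Int terms:
//
//	two256 := new(big.Int).Lsh(big.NewInt(1), 256)
//	nComp := new(big.Int).Sub(two256, n) // two's complement of N
//	sum := new(big.Int).Mod(new(big.Int).Add(s, nComp), two256)
//	// sum equals s - n whenever n <= s < 2^256.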

// SetBytes interprets the provided array as a 256-bit big-endian unsigned
// integer, reduces it modulo the group order, sets the scalar to the result,
// and returns either 1 if it was reduced (aka it overflowed) or 0 otherwise in
// constant time.
//
// Note that a bool is not used here because it is not possible in Go to convert
// from a bool to numeric value in constant time and many constant-time
// operations require a numeric value.
func (s *ModNScalar) SetBytes(b *[32]byte) uint32 {
	// Pack the 256 total bits across the 8 uint32 words.  This could be done
	// with a for loop, but benchmarks show this unrolled version is about 2
	// times faster than the variant that uses a loop.
	s.n[0] = uint32(b[31]) | uint32(b[30])<<8 | uint32(b[29])<<16 | uint32(b[28])<<24
	s.n[1] = uint32(b[27]) | uint32(b[26])<<8 | uint32(b[25])<<16 | uint32(b[24])<<24
	s.n[2] = uint32(b[23]) | uint32(b[22])<<8 | uint32(b[21])<<16 | uint32(b[20])<<24
	s.n[3] = uint32(b[19]) | uint32(b[18])<<8 | uint32(b[17])<<16 | uint32(b[16])<<24
	s.n[4] = uint32(b[15]) | uint32(b[14])<<8 | uint32(b[13])<<16 | uint32(b[12])<<24
	s.n[5] = uint32(b[11]) | uint32(b[10])<<8 | uint32(b[9])<<16 | uint32(b[8])<<24
	s.n[6] = uint32(b[7]) | uint32(b[6])<<8 | uint32(b[5])<<16 | uint32(b[4])<<24
	s.n[7] = uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24

	// The value might be >= N, so reduce it as required and return whether or
	// not it was reduced.
	overflows := s.overflows()
	s.reduce256(overflows)
	return overflows
}

// zeroArray32 zeroes the provided 32-byte buffer.
func zeroArray32(b *[32]byte) {
	copy(b[:], zero32[:])
}

// SetByteSlice interprets the provided slice as a 256-bit big-endian unsigned
// integer (meaning it is truncated to the first 32 bytes), reduces it modulo
// the group order, sets the scalar to the result, and returns whether or not
// the resulting truncated 256-bit integer overflowed in constant time.
//
// Note that since passing a slice with more than 32 bytes is truncated, it is
// possible that the truncated value is less than the order of the curve and
// hence it will not be reported as having overflowed in that case.  It is up to
// the caller to decide whether it needs to provide numbers of the appropriate
// size or it is acceptable to use this function with the described truncation
// and overflow behavior.
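//
// As an illustrative sketch of typical use (the specific hash function here
// is an assumption for the example, not a requirement of this method):
//
//	digest := sha256.Sum256(message)
//	var k ModNScalar
//	overflowed := k.SetByteSlice(digest[:])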
func (s *ModNScalar) SetByteSlice(b []byte) bool {
	var b32 [32]byte
	b = b[:constantTimeMin(uint32(len(b)), 32)]
	copy(b32[:], b32[:32-len(b)])
	copy(b32[32-len(b):], b)
	result := s.SetBytes(&b32)
	zeroArray32(&b32)
	return result != 0
}

// PutBytesUnchecked unpacks the scalar to a 32-byte big-endian value directly
// into the passed byte slice in constant time.  The target slice must have at
// least 32 bytes available or it will panic.
//
// There is a similar function, PutBytes, which unpacks the scalar into a
// 32-byte array directly.  This version is provided since it can be useful to
// write directly into part of a larger buffer without needing a separate
// allocation.
//
// Preconditions:
//   - The target slice MUST have at least 32 bytes available
func (s *ModNScalar) PutBytesUnchecked(b []byte) {
	// Unpack the 256 total bits from the 8 uint32 words.  This could be done
	// with a for loop, but benchmarks show this unrolled version is about 2
	// times faster than the variant which uses a loop.
	b[31] = byte(s.n[0])
	b[30] = byte(s.n[0] >> 8)
	b[29] = byte(s.n[0] >> 16)
	b[28] = byte(s.n[0] >> 24)
	b[27] = byte(s.n[1])
	b[26] = byte(s.n[1] >> 8)
	b[25] = byte(s.n[1] >> 16)
	b[24] = byte(s.n[1] >> 24)
	b[23] = byte(s.n[2])
	b[22] = byte(s.n[2] >> 8)
	b[21] = byte(s.n[2] >> 16)
	b[20] = byte(s.n[2] >> 24)
	b[19] = byte(s.n[3])
	b[18] = byte(s.n[3] >> 8)
	b[17] = byte(s.n[3] >> 16)
	b[16] = byte(s.n[3] >> 24)
	b[15] = byte(s.n[4])
	b[14] = byte(s.n[4] >> 8)
	b[13] = byte(s.n[4] >> 16)
	b[12] = byte(s.n[4] >> 24)
	b[11] = byte(s.n[5])
	b[10] = byte(s.n[5] >> 8)
	b[9] = byte(s.n[5] >> 16)
	b[8] = byte(s.n[5] >> 24)
	b[7] = byte(s.n[6])
	b[6] = byte(s.n[6] >> 8)
	b[5] = byte(s.n[6] >> 16)
	b[4] = byte(s.n[6] >> 24)
	b[3] = byte(s.n[7])
	b[2] = byte(s.n[7] >> 8)
	b[1] = byte(s.n[7] >> 16)
	b[0] = byte(s.n[7] >> 24)
}

// PutBytes unpacks the scalar to a 32-byte big-endian value using the passed
// byte array in constant time.
//
// There is a similar function, PutBytesUnchecked, which unpacks the scalar into
// a slice that must have at least 32 bytes available.  This version is provided
// since it can be useful to write directly into an array that is type checked.
//
// Alternatively, there is also Bytes, which unpacks the scalar into a new array
// and returns that which can sometimes be more ergonomic in applications that
// aren't concerned about an additional copy.
func (s *ModNScalar) PutBytes(b *[32]byte) {
	s.PutBytesUnchecked(b[:])
}

// Bytes unpacks the scalar to a 32-byte big-endian value in constant time.
//
// See PutBytes and PutBytesUnchecked for variants that allow an array or slice
// to be passed which can be useful to cut down on the number of allocations
// by allowing the caller to reuse a buffer or write directly into part of a
// larger buffer.
func (s *ModNScalar) Bytes() [32]byte {
	var b [32]byte
	s.PutBytesUnchecked(b[:])
	return b
}

// IsOdd returns whether or not the scalar is an odd number in constant time.
func (s *ModNScalar) IsOdd() bool {
	// Only odd numbers have the bottom bit set.
	return s.n[0]&1 == 1
}

// Equals returns whether or not the two scalars are the same in constant time.
func (s *ModNScalar) Equals(val *ModNScalar) bool {
	// Xor only sets bits when they are different, so the two scalars can only
	// be the same if no bits are set after xoring each word.
	bits := (s.n[0] ^ val.n[0]) | (s.n[1] ^ val.n[1]) | (s.n[2] ^ val.n[2]) |
		(s.n[3] ^ val.n[3]) | (s.n[4] ^ val.n[4]) | (s.n[5] ^ val.n[5]) |
		(s.n[6] ^ val.n[6]) | (s.n[7] ^ val.n[7])

	return bits == 0
}

// Add2 adds the passed two scalars together modulo the group order in constant
// time and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s3.Add2(s, s2).AddInt(1) so that s3 = s + s2 + 1.
func (s *ModNScalar) Add2(val1, val2 *ModNScalar) *ModNScalar {
	c := uint64(val1.n[0]) + uint64(val2.n[0])
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[1]) + uint64(val2.n[1])
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[2]) + uint64(val2.n[2])
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[3]) + uint64(val2.n[3])
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[4]) + uint64(val2.n[4])
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[5]) + uint64(val2.n[5])
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[6]) + uint64(val2.n[6])
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + uint64(val1.n[7]) + uint64(val2.n[7])
	s.n[7] = uint32(c & uint32Mask)

	// The result is now 256 bits, but it might still be >= N, so use the
	// existing normal reduce method for 256-bit values.
	s.reduce256(uint32(c>>32) + s.overflows())
	return s
}

// Add adds the passed scalar to the existing one modulo the group order in
// constant time and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.Add(s2).AddInt(1) so that s = s + s2 + 1.
func (s *ModNScalar) Add(val *ModNScalar) *ModNScalar {
	return s.Add2(s, val)
}
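
// The word-by-word carry chain used by Add2 (and by reduce256 above) can be
// sketched in isolation as follows (a hedged illustration with made-up
// variables, not package code):
//
//	c := uint64(x0) + uint64(y0)            // at most 33 bits, fits a uint64
//	r0 := uint32(c & uint32Mask)            // low 32 bits form the result word
//	c = (c >> 32) + uint64(x1) + uint64(y1) // carry rides into the next word
//	r1 := uint32(c & uint32Mask)
//	// ...through all 8 words, with the final c>>32 reporting any overflow.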

// accumulator96 provides a 96-bit accumulator for use in the intermediate
// calculations requiring more than 64 bits.
type accumulator96 struct {
	n [3]uint32
}

// Add adds the passed unsigned 64-bit value to the accumulator.
func (a *accumulator96) Add(v uint64) {
	low := uint32(v & uint32Mask)
	hi := uint32(v >> 32)
	a.n[0] += low
	hi += constantTimeLess(a.n[0], low) // Carry if overflow in n[0].
	a.n[1] += hi
	a.n[2] += constantTimeLess(a.n[1], hi) // Carry if overflow in n[1].
}

// Rsh32 right shifts the accumulator by 32 bits.
func (a *accumulator96) Rsh32() {
	a.n[0] = a.n[1]
	a.n[1] = a.n[2]
	a.n[2] = 0
}
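
// A hedged aside on the carry detection used by accumulator96.Add above (not
// package code): for uint32 values, the wrapped sum is smaller than either
// operand exactly when the addition overflowed, so comparing the result
// against an operand recovers the carry without widening to uint64.
//
//	sum := x + y                      // wraps modulo 2^32 on overflow
//	carry := constantTimeLess(sum, y) // 1 when the addition wrapped, else 0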

// reduce385 reduces the 385-bit intermediate result in the passed terms modulo
// the group order in constant time and stores the result in s.
func (s *ModNScalar) reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12 uint64) {
	// At this point, the intermediate result in the passed terms has been
	// reduced to fit within 385 bits, so reduce it again using the same method
	// described in reduce512.  As before, the intermediate result will end up
	// being reduced by another 127 bits to 258 bits, thus 9 32-bit terms are
	// needed for this iteration.  The reduced terms are assigned back to t0
	// through t8.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead until the value is reduced enough to use native uint64s.

	// Terms for 2^(32*0).
	var acc accumulator96
	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
	acc.Add(t8 * uint64(orderComplementWordZero))
	t0 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*1).
	acc.Add(t1)
	acc.Add(t8 * uint64(orderComplementWordOne))
	acc.Add(t9 * uint64(orderComplementWordZero))
	t1 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*2).
	acc.Add(t2)
	acc.Add(t8 * uint64(orderComplementWordTwo))
	acc.Add(t9 * uint64(orderComplementWordOne))
	acc.Add(t10 * uint64(orderComplementWordZero))
	t2 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*3).
	acc.Add(t3)
	acc.Add(t8 * uint64(orderComplementWordThree))
	acc.Add(t9 * uint64(orderComplementWordTwo))
	acc.Add(t10 * uint64(orderComplementWordOne))
	acc.Add(t11 * uint64(orderComplementWordZero))
	t3 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*4).
	acc.Add(t4)
	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t9 * uint64(orderComplementWordThree))
	acc.Add(t10 * uint64(orderComplementWordTwo))
	acc.Add(t11 * uint64(orderComplementWordOne))
	acc.Add(t12 * uint64(orderComplementWordZero))
	t4 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*5).
	acc.Add(t5)
	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t10 * uint64(orderComplementWordThree))
	acc.Add(t11 * uint64(orderComplementWordTwo))
	acc.Add(t12 * uint64(orderComplementWordOne))
	t5 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*6).
	acc.Add(t6)
	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
	acc.Add(t10) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t11 * uint64(orderComplementWordThree))
	acc.Add(t12 * uint64(orderComplementWordTwo))
	t6 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*7).
	acc.Add(t7)
	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t12 * uint64(orderComplementWordThree))
	t7 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*8).
	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
	t8 = uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.

	// NOTE: All of the remaining multiplications for this iteration result in 0
	// as they all involve multiplying by combinations of the fifth, sixth, and
	// seventh words of the two's complement of N, which are 0, so skip them.

	// At this point, the result is reduced to fit within 258 bits, so reduce it
	// again using a slightly modified version of the same method.  The maximum
	// value in t8 is 2 at this point and therefore multiplying it by each word
	// of the two's complement of N and adding it to a 32-bit term will result
	// in a maximum requirement of 33 bits, so it is safe to use native uint64s
	// here for the intermediate term carry propagation.
	//
	// Also, since the maximum value in t8 is 2, this ends up reducing by
	// another 2 bits to 256 bits.
	c := t0 + t8*uint64(orderComplementWordZero)
	s.n[0] = uint32(c & uint32Mask)
	c = (c >> 32) + t1 + t8*uint64(orderComplementWordOne)
	s.n[1] = uint32(c & uint32Mask)
	c = (c >> 32) + t2 + t8*uint64(orderComplementWordTwo)
	s.n[2] = uint32(c & uint32Mask)
	c = (c >> 32) + t3 + t8*uint64(orderComplementWordThree)
	s.n[3] = uint32(c & uint32Mask)
	c = (c >> 32) + t4 + t8 // * uint64(orderComplementWordFour) == * 1
	s.n[4] = uint32(c & uint32Mask)
	c = (c >> 32) + t5 // + t8*uint64(orderComplementWordFive) == 0
	s.n[5] = uint32(c & uint32Mask)
	c = (c >> 32) + t6 // + t8*uint64(orderComplementWordSix) == 0
	s.n[6] = uint32(c & uint32Mask)
	c = (c >> 32) + t7 // + t8*uint64(orderComplementWordSeven) == 0
	s.n[7] = uint32(c & uint32Mask)

	// The result is now 256 bits, but it might still be >= N, so use the
	// existing normal reduce method for 256-bit values.
	s.reduce256(uint32(c>>32) + s.overflows())
}

// reduce512 reduces the 512-bit intermediate result in the passed terms modulo
// the group order down to 385 bits in constant time and stores the result in s.
func (s *ModNScalar) reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15 uint64) {
	// At this point, the intermediate result in the passed terms is grouped
	// into the respective bases.
	//
	// Per [HAC] section 14.3.4: Reduction method of moduli of special form,
	// when the modulus is of the special form m = b^t - c, where log_2(c) < t,
	// highly efficient reduction can be achieved per the provided algorithm.
	//
	// The secp256k1 group order fits this criterion since it is:
	//   2^256 - 432420386565659656852420866394968145599
	//
	// Technically the max possible value here is (N-1)^2 since the two scalars
	// being multiplied are always mod N.  Nevertheless, it is safer to consider
	// it to be (2^256-1)^2 = 2^512 - 2^257 + 1 since it is the product of two
	// 256-bit values.
	//
	// The algorithm is to reduce the result modulo the prime by subtracting
	// multiples of the group order N.  However, in order to simplify carry
	// propagation, this adds with the two's complement of N to achieve the same
	// result.
	//
	// Since the two's complement of N has 127 leading zero bits, this will end
	// up reducing the intermediate result from 512 bits to 385 bits, resulting
	// in 13 32-bit terms.  The reduced terms are assigned back to t0 through
	// t12.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead.

	// Terms for 2^(32*0).
	var acc accumulator96
	acc.n[0] = uint32(t0) // == acc.Add(t0) because acc is guaranteed to be 0.
	acc.Add(t8 * uint64(orderComplementWordZero))
	t0 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*1).
	acc.Add(t1)
	acc.Add(t8 * uint64(orderComplementWordOne))
	acc.Add(t9 * uint64(orderComplementWordZero))
	t1 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*2).
	acc.Add(t2)
	acc.Add(t8 * uint64(orderComplementWordTwo))
	acc.Add(t9 * uint64(orderComplementWordOne))
	acc.Add(t10 * uint64(orderComplementWordZero))
	t2 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*3).
	acc.Add(t3)
	acc.Add(t8 * uint64(orderComplementWordThree))
	acc.Add(t9 * uint64(orderComplementWordTwo))
	acc.Add(t10 * uint64(orderComplementWordOne))
	acc.Add(t11 * uint64(orderComplementWordZero))
	t3 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*4).
	acc.Add(t4)
	acc.Add(t8) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t9 * uint64(orderComplementWordThree))
	acc.Add(t10 * uint64(orderComplementWordTwo))
	acc.Add(t11 * uint64(orderComplementWordOne))
	acc.Add(t12 * uint64(orderComplementWordZero))
	t4 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*5).
	acc.Add(t5)
	// acc.Add(t8 * uint64(orderComplementWordFive)) // 0
	acc.Add(t9) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t10 * uint64(orderComplementWordThree))
	acc.Add(t11 * uint64(orderComplementWordTwo))
	acc.Add(t12 * uint64(orderComplementWordOne))
	acc.Add(t13 * uint64(orderComplementWordZero))
	t5 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*6).
	acc.Add(t6)
	// acc.Add(t8 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t9 * uint64(orderComplementWordFive)) // 0
	acc.Add(t10) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t11 * uint64(orderComplementWordThree))
	acc.Add(t12 * uint64(orderComplementWordTwo))
	acc.Add(t13 * uint64(orderComplementWordOne))
	acc.Add(t14 * uint64(orderComplementWordZero))
	t6 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*7).
	acc.Add(t7)
	// acc.Add(t8 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t9 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t10 * uint64(orderComplementWordFive)) // 0
	acc.Add(t11) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t12 * uint64(orderComplementWordThree))
	acc.Add(t13 * uint64(orderComplementWordTwo))
	acc.Add(t14 * uint64(orderComplementWordOne))
	acc.Add(t15 * uint64(orderComplementWordZero))
	t7 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*8).
	// acc.Add(t9 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t10 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t11 * uint64(orderComplementWordFive)) // 0
	acc.Add(t12) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t13 * uint64(orderComplementWordThree))
	acc.Add(t14 * uint64(orderComplementWordTwo))
	acc.Add(t15 * uint64(orderComplementWordOne))
	t8 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*9).
	// acc.Add(t10 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t11 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t12 * uint64(orderComplementWordFive)) // 0
	acc.Add(t13) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t14 * uint64(orderComplementWordThree))
	acc.Add(t15 * uint64(orderComplementWordTwo))
	t9 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*10).
	// acc.Add(t11 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t12 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t13 * uint64(orderComplementWordFive)) // 0
	acc.Add(t14) // * uint64(orderComplementWordFour) // * 1
	acc.Add(t15 * uint64(orderComplementWordThree))
	t10 = uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*11).
	// acc.Add(t12 * uint64(orderComplementWordSeven)) // 0
	// acc.Add(t13 * uint64(orderComplementWordSix)) // 0
	// acc.Add(t14 * uint64(orderComplementWordFive)) // 0
	acc.Add(t15) // * uint64(orderComplementWordFour) // * 1
	t11 = uint64(acc.n[0])
	acc.Rsh32()

	// NOTE: All of the remaining multiplications for this iteration result in 0
	// as they all involve multiplying by combinations of the fifth, sixth, and
	// seventh words of the two's complement of N, which are 0, so skip them.

	// Terms for 2^(32*12).
	t12 = uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.

	// At this point, the result is reduced to fit within 385 bits, so reduce it
	// again using the same method accordingly.
	s.reduce385(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12)
}
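
// As a hedged recap of the special-form reduction used by reduce512 and
// reduce385 (illustrative math only, not package code): the group order has
// the form N = 2^256 - c, so 2^256 ≡ c (mod N).  Splitting a wide value x
// into x = xHi*2^256 + xLo therefore gives
//
//	x ≡ xHi*c + xLo (mod N)
//
// and because c (the two's complement of N) has 127 leading zero bits, each
// application of the identity shrinks x by roughly 127 bits until a final
// conditional subtraction handles the remainder.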

// Mul2 multiplies the passed two scalars together modulo the group order in
// constant time and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s3.Mul2(s, s2).AddInt(1) so that s3 = (s * s2) + 1.
func (s *ModNScalar) Mul2(val, val2 *ModNScalar) *ModNScalar {
	// This could be done with for loops and an array to store the intermediate
	// terms, but this unrolled version is significantly faster.

	// The overall strategy employed here is:
	// 1) Calculate the 512-bit product of the two scalars using the standard
	//    pencil-and-paper method.
	// 2) Reduce the result modulo the prime by effectively subtracting
	//    multiples of the group order N (actually performed by adding multiples
	//    of the two's complement of N to avoid implementing subtraction).
	// 3) Repeat step 2 noting that each iteration reduces the required number
	//    of bits by 127 because the two's complement of N has 127 leading zero
	//    bits.
	// 4) Once reduced to 256 bits, call the existing reduce method to perform
	//    a final reduction as needed.
	//
	// Note that several of the intermediate calculations require adding 64-bit
	// products together which would overflow a uint64, so a 96-bit accumulator
	// is used instead.

	// Terms for 2^(32*0).
	var acc accumulator96
	acc.Add(uint64(val.n[0]) * uint64(val2.n[0]))
	t0 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*1).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[0]))
	t1 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*2).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[0]))
	t2 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*3).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[0]))
	t3 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*4).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[0]))
	t4 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*5).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[0]))
	t5 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*6).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[0]))
	t6 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*7).
	acc.Add(uint64(val.n[0]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[1]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[1]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[0]))
	t7 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*8).
	acc.Add(uint64(val.n[1]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[2]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[2]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[1]))
	t8 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*9).
	acc.Add(uint64(val.n[2]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[3]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[3]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[2]))
	t9 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*10).
	acc.Add(uint64(val.n[3]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[4]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[4]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[3]))
	t10 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*11).
	acc.Add(uint64(val.n[4]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[5]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[5]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[4]))
	t11 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*12).
	acc.Add(uint64(val.n[5]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[6]) * uint64(val2.n[6]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[5]))
	t12 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*13).
	acc.Add(uint64(val.n[6]) * uint64(val2.n[7]))
	acc.Add(uint64(val.n[7]) * uint64(val2.n[6]))
	t13 := uint64(acc.n[0])
	acc.Rsh32()

	// Terms for 2^(32*14).
	acc.Add(uint64(val.n[7]) * uint64(val2.n[7]))
	t14 := uint64(acc.n[0])
	acc.Rsh32()

	// What's left is for 2^(32*15).
	t15 := uint64(acc.n[0])
	// acc.Rsh32() // No need since not used after this.  Guaranteed to be 0.

	// At this point, all of the terms are grouped into their respective base
	// and occupy up to 512 bits.  Reduce the result accordingly.
	s.reduce512(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
		t15)
	return s
}

// Mul multiplies the passed scalar with the existing one modulo the group order
// in constant time and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.Mul(s2).AddInt(1) so that s = (s * s2) + 1.
func (s *ModNScalar) Mul(val *ModNScalar) *ModNScalar {
	return s.Mul2(s, val)
}

// SquareVal squares the passed scalar modulo the group order in constant time
// and stores the result in s.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s3.SquareVal(s).Mul(s) so that s3 = s^2 * s = s^3.
func (s *ModNScalar) SquareVal(val *ModNScalar) *ModNScalar {
	// This could technically be optimized slightly to take advantage of the
	// fact that many of the intermediate calculations in squaring are just
	// doubling, however, benchmarking has shown that due to the need to use a
	// 96-bit accumulator, any savings are essentially offset by that and
	// consequently there is no real difference in performance over just
	// multiplying the value by itself to justify the extra code for now.  This
	// can be revisited in the future if it becomes a bottleneck in practice.

	return s.Mul2(val, val)
}

// Square squares the scalar modulo the group order in constant time.  The
// existing scalar is modified.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.Square().Mul(s2) so that s = s^2 * s2.
func (s *ModNScalar) Square() *ModNScalar {
	return s.SquareVal(s)
}

// NegateVal negates the passed scalar modulo the group order and stores the
// result in s in constant time.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.NegateVal(s2).AddInt(1) so that s = -s2 + 1.
func (s *ModNScalar) NegateVal(val *ModNScalar) *ModNScalar {
	// Since the scalar is already in the range 0 <= val < N, where N is the
	// group order, negation modulo the group order is just the group order
	// minus the value.  This implies that the result will always be in the
	// desired range with the sole exception of 0 because N - 0 = N itself.
	//
	// Therefore, in order to avoid the need to reduce the result for every
	// other case in order to achieve constant time, this creates a mask that is
	// all 0s in the case the scalar being negated is 0 and all 1s otherwise
	// and bitwise ands that mask with each word.
	//
	// Finally, to simplify the carry propagation, this adds the two's
	// complement of the scalar to N in order to achieve the same result.
	bits := val.n[0] | val.n[1] | val.n[2] | val.n[3] | val.n[4] | val.n[5] |
		val.n[6] | val.n[7]
	mask := uint64(uint32Mask * constantTimeNotEq(bits, 0))
	c := uint64(orderWordZero) + (uint64(^val.n[0]) + 1)
	s.n[0] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordOne) + uint64(^val.n[1])
	s.n[1] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordTwo) + uint64(^val.n[2])
	s.n[2] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordThree) + uint64(^val.n[3])
	s.n[3] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordFour) + uint64(^val.n[4])
	s.n[4] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordFive) + uint64(^val.n[5])
	s.n[5] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordSix) + uint64(^val.n[6])
	s.n[6] = uint32(c & mask)
	c = (c >> 32) + uint64(orderWordSeven) + uint64(^val.n[7])
	s.n[7] = uint32(c & mask)
	return s
}

// Negate negates the scalar modulo the group order in constant time.  The
// existing scalar is modified.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.Negate().AddInt(1) so that s = -s + 1.
func (s *ModNScalar) Negate() *ModNScalar {
	return s.NegateVal(s)
}

// InverseValNonConst finds the modular multiplicative inverse of the passed
// scalar and stores the result in s in *non-constant* time.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s3.InverseVal(s1).Mul(s2) so that s3 = s1^-1 * s2.
func (s *ModNScalar) InverseValNonConst(val *ModNScalar) *ModNScalar {
	// This is making use of big integers for now.  Ideally it will be replaced
	// with an implementation that does not depend on big integers.
	valBytes := val.Bytes()
	bigVal := new(big.Int).SetBytes(valBytes[:])
	bigVal.ModInverse(bigVal, curveParams.N)
	s.SetByteSlice(bigVal.Bytes())
	return s
}

// InverseNonConst finds the modular multiplicative inverse of the scalar in
// *non-constant* time.  The existing scalar is modified.
//
// The scalar is returned to support chaining.  This enables syntax like:
// s.Inverse().Mul(s2) so that s = s^-1 * s2.
func (s *ModNScalar) InverseNonConst() *ModNScalar {
	return s.InverseValNonConst(s)
}

// IsOverHalfOrder returns whether or not the scalar exceeds the group order
// divided by 2 in constant time.
func (s *ModNScalar) IsOverHalfOrder() bool {
	// The intuition here is that the scalar is greater than half of the group
	// order if one of the higher individual words is greater than the
	// corresponding word of the half group order and all higher words in the
	// scalar are equal to their corresponding word of the half group order.
	//
	// Note that the words 4, 5, and 6 are all the max uint32 value, so there is
	// no need to test if those individual words of the scalar exceed them,
	// hence, only equality is checked for them.
	result := constantTimeGreater(s.n[7], halfOrderWordSeven)
	highWordsEqual := constantTimeEq(s.n[7], halfOrderWordSeven)
	highWordsEqual &= constantTimeEq(s.n[6], halfOrderWordSix)
	highWordsEqual &= constantTimeEq(s.n[5], halfOrderWordFive)
	highWordsEqual &= constantTimeEq(s.n[4], halfOrderWordFour)
	result |= highWordsEqual & constantTimeGreater(s.n[3], halfOrderWordThree)
	highWordsEqual &= constantTimeEq(s.n[3], halfOrderWordThree)
	result |= highWordsEqual & constantTimeGreater(s.n[2], halfOrderWordTwo)
	highWordsEqual &= constantTimeEq(s.n[2], halfOrderWordTwo)
	result |= highWordsEqual & constantTimeGreater(s.n[1], halfOrderWordOne)
	highWordsEqual &= constantTimeEq(s.n[1], halfOrderWordOne)
	result |= highWordsEqual & constantTimeGreater(s.n[0], halfOrderWordZero)

	return result != 0
}