package amd64

import (
	"fmt"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
)

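// swizzleMask is the mask used by lowerSwizzle: adding 0x70 to each index byte with
// unsigned saturation sends any out-of-range index (>= 16) to >= 0x80, and PSHUFB
// zeroes the lanes whose index byte has the top bit set.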
var swizzleMask = [16]byte{
	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
	0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
}

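// lowerSwizzle lowers an i8x16 swizzle to PADDUSB (index saturation) followed by PSHUFB.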
func (m *machine) lowerSwizzle(x, y ssa.Value, ret ssa.Value) {
	maskLabel := m.getOrAllocateConstLabel(&m.constSwizzleMaskConstIndex, swizzleMask[:])

	// Load mask to maskReg.
	maskReg := m.c.AllocateVReg(ssa.TypeV128)
	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg)
	m.insert(loadMask)

	// Copy x and y to tmp registers.
	xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	tmpX := m.copyToTmp(xx.reg())
	yy := m.getOperand_Reg(m.c.ValueDefinition(y))
	tmpY := m.copyToTmp(yy.reg())

	// Saturate the indices in tmpY, then shuffle the bytes of tmpX accordingly.
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddusb, newOperandReg(maskReg), tmpY))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpY), tmpX))

	// Copy the result to the destination register.
	m.copyTo(tmpX, m.c.VRegOf(ret))
}

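// lowerInsertLane copies x into a temporary vector register and overwrites the lane
// at the given index with the scalar value y.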
func (m *machine) lowerInsertLane(x, y ssa.Value, index byte, ret ssa.Value, lane ssa.VecLane) {
	// Copy x to tmp.
	tmpDst := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, m.getOperand_Mem_Reg(m.c.ValueDefinition(x)), tmpDst))

	yy := m.getOperand_Reg(m.c.ValueDefinition(y))
	switch lane {
	case ssa.VecLaneI8x16:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, index, yy, tmpDst))
	case ssa.VecLaneI16x8:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, index, yy, tmpDst))
	case ssa.VecLaneI32x4:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, index, yy, tmpDst))
	case ssa.VecLaneI64x2:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, index, yy, tmpDst))
	case ssa.VecLaneF32x4:
		// In the INSERTPS instruction, the destination index is encoded in bits 4 and 5 of the immediate.
		// See https://www.felixcloutier.com/x86/insertps
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, index<<4, yy, tmpDst))
	case ssa.VecLaneF64x2:
		if index == 0 {
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, yy, tmpDst))
		} else {
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMovlhps, yy, tmpDst))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.copyTo(tmpDst, m.c.VRegOf(ret))
}

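// lowerExtractLane extracts the lane at the given index from x, sign- or zero-extending
// the narrow integer lanes into the destination register.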
func (m *machine) lowerExtractLane(x ssa.Value, index byte, signed bool, ret ssa.Value, lane ssa.VecLane) {
	// Pextr variants are used to extract a lane from a vector register.
	xx := m.getOperand_Reg(m.c.ValueDefinition(x))

	tmpDst := m.c.AllocateVReg(ret.Type())
	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
	switch lane {
	case ssa.VecLaneI8x16:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrb, index, xx, tmpDst))
		if signed {
			m.insert(m.allocateInstr().asMovsxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
		} else {
			m.insert(m.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(tmpDst), tmpDst))
		}
	case ssa.VecLaneI16x8:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrw, index, xx, tmpDst))
		if signed {
			m.insert(m.allocateInstr().asMovsxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
		} else {
			m.insert(m.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(tmpDst), tmpDst))
		}
	case ssa.VecLaneI32x4:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrd, index, xx, tmpDst))
	case ssa.VecLaneI64x2:
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, index, xx, tmpDst))
	case ssa.VecLaneF32x4:
		if index == 0 {
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovss, xx, tmpDst))
		} else {
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, index, xx, tmpDst))
		}
	case ssa.VecLaneF64x2:
		if index == 0 {
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, xx, tmpDst))
		} else {
			m.copyTo(xx.reg(), tmpDst)
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0b00_00_11_10, newOperandReg(tmpDst), tmpDst))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.copyTo(tmpDst, m.c.VRegOf(ret))
}

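// sqmulRoundSat holds 0x8000 in each 16-bit lane. PMULHRSW produces 0x8000 only for
// INT16_MIN*INT16_MIN, and that lane must be flipped to 0x7fff to saturate correctly.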
var sqmulRoundSat = [16]byte{
	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
	0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
}

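// lowerSqmulRoundSat lowers i16x8.q15mulr_sat_s via PMULHRSW plus a fixup of the single
// overflowing lane value.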
func (m *machine) lowerSqmulRoundSat(x, y, ret ssa.Value) {
	// See https://github.com/WebAssembly/simd/pull/365 for the following logic.
	maskLabel := m.getOrAllocateConstLabel(&m.constSqmulRoundSatIndex, sqmulRoundSat[:])

	tmp := m.c.AllocateVReg(ssa.TypeV128)
	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp)
	m.insert(loadMask)

	xx, yy := m.getOperand_Reg(m.c.ValueDefinition(x)), m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
	tmpX := m.copyToTmp(xx.reg())

	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmulhrsw, yy, tmpX))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqw, newOperandReg(tmpX), tmp))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmpX))

	m.copyTo(tmpX, m.c.VRegOf(ret))
}

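// lowerVUshr lowers a vector logical right shift.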
func (m *machine) lowerVUshr(x, y, ret ssa.Value, lane ssa.VecLane) {
	switch lane {
	case ssa.VecLaneI8x16:
		m.lowerVUshri8x16(x, y, ret)
	case ssa.VecLaneI16x8, ssa.VecLaneI32x4, ssa.VecLaneI64x2:
		m.lowerShr(x, y, ret, lane, false)
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}
}

// i8x16LogicalSHRMaskTable is necessary for emulating the non-existent packed-byte logical right shift on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16LogicalSHRMaskTable = [8 * 16]byte{ // 16 bytes for each possible shift amount (0, 1, ..., 7).
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
	0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, // for 1 shift
	0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, // for 2 shift
	0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, // for 3 shift
	0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, // for 4 shift
	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, // for 5 shift
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, // for 6 shift
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, // for 7 shift
}

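// lowerVUshri8x16 lowers the i8x16 logical right shift, which has no dedicated instruction
// on amd64: it shifts 16-bit words and then masks off the bits that leaked in from the
// neighboring byte using i8x16LogicalSHRMaskTable.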
func (m *machine) lowerVUshri8x16(x, y, ret ssa.Value) {
	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
	// Load the modulo 8 mask into tmpGpReg.
	m.lowerIconst(tmpGpReg, 0x7, false)
	// Take the modulo 8 of the shift amount.
	shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, tmpGpReg, false))

	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	vecTmp := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), vecTmp, false))
	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrlw, newOperandReg(vecTmp), xx))

	maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16LogicalSHRMaskTableIndex, i8x16LogicalSHRMaskTable[:])
	base := m.c.AllocateVReg(ssa.TypeI64)
	lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
	m.insert(lea)

	// Shift tmpGpReg by 4 to multiply the shift amount by 16.
	m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))

	mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
	loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), vecTmp)
	m.insert(loadMask)

	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(vecTmp), xx))
	m.copyTo(xx, m.c.VRegOf(ret))
}

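// lowerVSshr lowers a vector arithmetic right shift.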
func (m *machine) lowerVSshr(x, y, ret ssa.Value, lane ssa.VecLane) {
	switch lane {
	case ssa.VecLaneI8x16:
		m.lowerVSshri8x16(x, y, ret)
	case ssa.VecLaneI16x8, ssa.VecLaneI32x4:
		m.lowerShr(x, y, ret, lane, true)
	case ssa.VecLaneI64x2:
		m.lowerVSshri64x2(x, y, ret)
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}
}

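// lowerVSshri8x16 lowers the i8x16 arithmetic right shift by widening the bytes into words
// with PUNPCKLBW/PUNPCKHBW, shifting the words with PSRAW, and re-packing with PACKSSWB.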
func (m *machine) lowerVSshri8x16(x, y, ret ssa.Value) {
	shiftAmtReg := m.c.AllocateVReg(ssa.TypeI32)
	// Load the modulo 8 mask into shiftAmtReg.
	m.lowerIconst(shiftAmtReg, 0x7, false)
	// Take the modulo 8 of the shift amount.
	shiftAmt := m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y))
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, shiftAmt, shiftAmtReg, false))

	// Copy the x value to two temporary registers.
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())
	vecTmp := m.c.AllocateVReg(ssa.TypeV128)
	m.copyTo(xx, vecTmp)

	// Assuming that we have
	//  xx     = [b1, ..., b16]
	//  vecTmp = [b1, ..., b16]
	// at this point, then we use PUNPCKLBW and PUNPCKHBW to produce:
	//  xx     = [b1, b1, b2, b2, ..., b8, b8]
	//  vecTmp = [b9, b9, b10, b10, ..., b16, b16]
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpcklbw, newOperandReg(xx), xx))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePunpckhbw, newOperandReg(vecTmp), vecTmp))

	// Add 8 to the shift amount, and then move it to vecTmp2.
	vecTmp2 := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAdd, newOperandImm32(8), shiftAmtReg, false))
	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(shiftAmtReg), vecTmp2, false))

	// Perform the word packed arithmetic right shifts on xx and vecTmp.
	// This changes these two registers as:
	//  xx     = [xxx, b1 >> s, xxx, b2 >> s, ..., xxx, b8 >> s]
	//  vecTmp = [xxx, b9 >> s, xxx, b10 >> s, ..., xxx, b16 >> s]
	// where xxx is the sign-extension byte of each lane, and ">>" is the arithmetic shift on a byte.
	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), xx))
	m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsraw, newOperandReg(vecTmp2), vecTmp))

	// Finally, we can get the result by packing these two word vectors.
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePacksswb, newOperandReg(vecTmp), xx))

	m.copyTo(xx, m.c.VRegOf(ret))
}

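// lowerVSshri64x2 lowers the i64x2 arithmetic right shift, which has no SSE instruction:
// each 64-bit lane is extracted into a GPR, shifted with SAR, and re-inserted.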
func (m *machine) lowerVSshri64x2(x, y, ret ssa.Value) {
	// Load the shift amount to RCX.
	shiftAmt := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
	m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, shiftAmt, rcxVReg))

	tmpGp := m.c.AllocateVReg(ssa.TypeI64)

	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpGp))
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 0, newOperandReg(xx), tmpGp))
	m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), xx))
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePextrq, 1, newOperandReg(xx), tmpGp))
	m.insert(m.allocateInstr().asShiftR(shiftROpShiftRightArithmetic, newOperandReg(rcxVReg), tmpGp, true))
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), xx))

	m.copyTo(xx, m.c.VRegOf(ret))
}

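// lowerShr lowers the vector right shifts that map directly onto a single packed shift instruction.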
func (m *machine) lowerShr(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
	var modulo uint64
	var shiftOp sseOpcode
	switch lane {
	case ssa.VecLaneI16x8:
		modulo = 0xf
		if signed {
			shiftOp = sseOpcodePsraw
		} else {
			shiftOp = sseOpcodePsrlw
		}
	case ssa.VecLaneI32x4:
		modulo = 0x1f
		if signed {
			shiftOp = sseOpcodePsrad
		} else {
			shiftOp = sseOpcodePsrld
		}
	case ssa.VecLaneI64x2:
		modulo = 0x3f
		if signed {
			panic("BUG")
		}
		shiftOp = sseOpcodePsrlq
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
	// Load the modulo mask (lane bit width - 1) into tmpGpReg.
	m.lowerIconst(tmpGpReg, modulo, false)
	// Mask the shift amount to the lane bit width.
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
		m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
	// And move it to a xmm register.
	tmpVec := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))

	// Then do the actual shift.
	m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))

	m.copyTo(xx, m.c.VRegOf(ret))
}

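// lowerVIshl lowers vector left shifts. The i8x16 case is emulated with a 16-bit shift
// followed by masking via i8x16SHLMaskTable.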
func (m *machine) lowerVIshl(x, y, ret ssa.Value, lane ssa.VecLane) {
	var modulo uint64
	var shiftOp sseOpcode
	var isI8x16 bool
	switch lane {
	case ssa.VecLaneI8x16:
		isI8x16 = true
		modulo = 0x7
		shiftOp = sseOpcodePsllw
	case ssa.VecLaneI16x8:
		modulo = 0xf
		shiftOp = sseOpcodePsllw
	case ssa.VecLaneI32x4:
		modulo = 0x1f
		shiftOp = sseOpcodePslld
	case ssa.VecLaneI64x2:
		modulo = 0x3f
		shiftOp = sseOpcodePsllq
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	tmpGpReg := m.c.AllocateVReg(ssa.TypeI32)
	// Load the modulo mask (lane bit width - 1) into tmpGpReg.
	m.lowerIconst(tmpGpReg, modulo, false)
	// Mask the shift amount to the lane bit width.
	m.insert(m.allocateInstr().asAluRmiR(aluRmiROpcodeAnd,
		m.getOperand_Mem_Imm32_Reg(m.c.ValueDefinition(y)), tmpGpReg, false))
	// And move it to a xmm register.
	tmpVec := m.c.AllocateVReg(ssa.TypeV128)
	m.insert(m.allocateInstr().asGprToXmm(sseOpcodeMovd, newOperandReg(tmpGpReg), tmpVec, false))

	// Then do the actual shift.
	m.insert(m.allocateInstr().asXmmRmiReg(shiftOp, newOperandReg(tmpVec), xx))

	if isI8x16 {
		maskTableLabel := m.getOrAllocateConstLabel(&m.constI8x16SHLMaskTableIndex, i8x16SHLMaskTable[:])
		base := m.c.AllocateVReg(ssa.TypeI64)
		lea := m.allocateInstr().asLEA(newOperandLabel(maskTableLabel), base)
		m.insert(lea)

		// Shift tmpGpReg by 4 to multiply the shift amount by 16.
		m.insert(m.allocateInstr().asShiftR(shiftROpShiftLeft, newOperandImm32(4), tmpGpReg, false))

		mem := m.newAmodeRegRegShift(0, base, tmpGpReg, 0)
		loadMask := m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(mem), tmpVec)
		m.insert(loadMask)

		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePand, newOperandReg(tmpVec), xx))
	}

	m.copyTo(xx, m.c.VRegOf(ret))
}

// i8x16SHLMaskTable is necessary for emulating the non-existent packed-byte left shift on amd64.
// The mask is applied after performing packed word shifts on the value to clear out the unnecessary bits.
var i8x16SHLMaskTable = [8 * 16]byte{ // 16 bytes for each possible shift amount (0, 1, ..., 7).
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // for 0 shift
	0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, // for 1 shift
	0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, // for 2 shift
	0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, // for 3 shift
	0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, // for 4 shift
	0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, // for 5 shift
	0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, // for 6 shift
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, // for 7 shift
}

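// lowerVRound lowers vector rounding (ceil, floor, trunc, nearest) to ROUNDPS/ROUNDPD
// with the given rounding-mode immediate.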
func (m *machine) lowerVRound(x, ret ssa.Value, imm byte, _64 bool) {
	xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
	var round sseOpcode
	if _64 {
		round = sseOpcodeRoundpd
	} else {
		round = sseOpcodeRoundps
	}
	m.insert(m.allocateInstr().asXmmUnaryRmRImm(round, imm, xx, m.c.VRegOf(ret)))
}

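// Constant masks used by lowerExtIaddPairwise below.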
var (
	allOnesI8x16              = [16]byte{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}
	allOnesI16x8              = [16]byte{0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0, 0x1, 0x0}
	extAddPairwiseI16x8uMask1 = [16]byte{0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80}
	extAddPairwiseI16x8uMask2 = [16]byte{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00}
)

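// lowerExtIaddPairwise lowers extended pairwise addition via PMADDUBSW/PMADDWD against an
// all-ones vector, with extra sign-bias fixups for the unsigned i16x8 case.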
func (m *machine) lowerExtIaddPairwise(x, ret ssa.Value, srcLane ssa.VecLane, signed bool) {
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())
	switch srcLane {
	case ssa.VecLaneI8x16:
		allOneReg := m.c.AllocateVReg(ssa.TypeV128)
		mask := m.getOrAllocateConstLabel(&m.constAllOnesI8x16Index, allOnesI8x16[:])
		m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOneReg))

		var resultReg regalloc.VReg
		if signed {
			resultReg = allOneReg
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(xx), resultReg))
		} else {
			// Interpret the all-ones vector as the signed-byte operand, so that the multiply-add treats x's bytes as unsigned.
			resultReg = xx
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddubsw, newOperandReg(allOneReg), resultReg))
		}
		m.copyTo(resultReg, m.c.VRegOf(ret))

	case ssa.VecLaneI16x8:
		if signed {
			allOnesReg := m.c.AllocateVReg(ssa.TypeV128)
		mask := m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), allOnesReg))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(allOnesReg), xx))
			m.copyTo(xx, m.c.VRegOf(ret))
		} else {
			maskReg := m.c.AllocateVReg(ssa.TypeV128)
			mask := m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask1Index, extAddPairwiseI16x8uMask1[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))

			// Flip the sign bits on xx.
			//
			// Assuming that xx = [w1, ..., w8] (unsigned 16-bit lanes), now we have,
			// 	xx[i] = int16(wi - 0x8000) for i = 1,...,8
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(maskReg), xx))

			mask = m.getOrAllocateConstLabel(&m.constAllOnesI16x8Index, allOnesI16x8[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))

			// For i = 1,...,4 (as this results in i32x4 lanes), now we have
			// xx[i] = int32(w(2i-1) + w(2i)) - 0x10000
			// c.assembler.CompileRegisterToRegister(amd64.PMADDWD, tmp, vr)
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, newOperandReg(maskReg), xx))

			mask = m.getOrAllocateConstLabel(&m.constExtAddPairwiseI16x8uMask2Index, extAddPairwiseI16x8uMask2[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(mask)), maskReg))

			// xx[i] = int32(w(2i-1) + w(2i)) - 0x10000 + 0x10000 = uint32(w(2i-1) + w(2i)).
			// c.assembler.CompileRegisterToRegister(amd64.PADDD, tmp, vr)
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(maskReg), xx))

			m.copyTo(xx, m.c.VRegOf(ret))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", srcLane))
	}
}

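// lowerWidenLow lowers the low-half widening instructions to PMOVSX*/PMOVZX*.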
func (m *machine) lowerWidenLow(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
	var sseOp sseOpcode
	switch lane {
	case ssa.VecLaneI8x16:
		if signed {
			sseOp = sseOpcodePmovsxbw
		} else {
			sseOp = sseOpcodePmovzxbw
		}
	case ssa.VecLaneI16x8:
		if signed {
			sseOp = sseOpcodePmovsxwd
		} else {
			sseOp = sseOpcodePmovzxwd
		}
	case ssa.VecLaneI32x4:
		if signed {
			sseOp = sseOpcodePmovsxdq
		} else {
			sseOp = sseOpcodePmovzxdq
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
	m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, xx, m.c.VRegOf(ret)))
}

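// lowerWidenHigh lowers the high-half widening instructions: PALIGNR moves the upper
// 64 bits of x into the lower half, then PMOVSX*/PMOVZX* widens them.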
func (m *machine) lowerWidenHigh(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
	tmp := m.c.AllocateVReg(ssa.TypeV128)
	xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	m.copyTo(xx.reg(), tmp)
	// Bring the upper 64 bits down to the lower half, since PMOVSX/PMOVZX read the low half.
	m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePalignr, 8, newOperandReg(tmp), tmp))

	var sseOp sseOpcode
	switch lane {
	case ssa.VecLaneI8x16:
		if signed {
			sseOp = sseOpcodePmovsxbw
		} else {
			sseOp = sseOpcodePmovzxbw
		}
	case ssa.VecLaneI16x8:
		if signed {
			sseOp = sseOpcodePmovsxwd
		} else {
			sseOp = sseOpcodePmovzxwd
		}
	case ssa.VecLaneI32x4:
		if signed {
			sseOp = sseOpcodePmovsxdq
		} else {
			sseOp = sseOpcodePmovzxdq
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.insert(m.allocateInstr().asXmmUnaryRmR(sseOp, newOperandReg(tmp), m.c.VRegOf(ret)))
}

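// lowerLoadSplat loads a scalar of the given lane width from memory and broadcasts it
// to every lane of the result vector.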
func (m *machine) lowerLoadSplat(ptr ssa.Value, offset uint32, ret ssa.Value, lane ssa.VecLane) {
	tmpDst, tmpGp := m.c.AllocateVReg(ssa.TypeV128), m.c.AllocateVReg(ssa.TypeI64)
	am := newOperandMem(m.lowerToAddressMode(ptr, offset))

	m.insert(m.allocateInstr().asDefineUninitializedReg(tmpDst))
	switch lane {
	case ssa.VecLaneI8x16:
		m.insert(m.allocateInstr().asMovzxRmR(extModeBQ, am, tmpGp))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, newOperandReg(tmpGp), tmpDst))
		tmpZeroVec := m.c.AllocateVReg(ssa.TypeV128)
		m.insert(m.allocateInstr().asZeros(tmpZeroVec))
		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(tmpZeroVec), tmpDst))
	case ssa.VecLaneI16x8:
		m.insert(m.allocateInstr().asMovzxRmR(extModeWQ, am, tmpGp))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, newOperandReg(tmpGp), tmpDst))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, newOperandReg(tmpGp), tmpDst))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
	case ssa.VecLaneI32x4:
		m.insert(m.allocateInstr().asMovzxRmR(extModeLQ, am, tmpGp))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, newOperandReg(tmpGp), tmpDst))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(tmpDst), tmpDst))
	case ssa.VecLaneI64x2:
		m.insert(m.allocateInstr().asMov64MR(am, tmpGp))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, newOperandReg(tmpGp), tmpDst))
		m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, newOperandReg(tmpGp), tmpDst))
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.copyTo(tmpDst, m.c.VRegOf(ret))
}

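// f64x2CvtFromIMask holds 0x43300000 (the high 32 bits of float64(0x1.0p52)) in the second
// and fourth 32-bit lanes; interleaving an i32x4 value with it via UNPCKLPS produces
// float64(uint32(x)) + 0x1.0p52 in each of the two f64 lanes.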
var f64x2CvtFromIMask = [16]byte{
	0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
}

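// lowerVFcvtFromInt lowers integer-to-float vector conversions. The unsigned variants have
// no direct SSE instruction and are emulated as described in the comments below.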
func (m *machine) lowerVFcvtFromInt(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
	switch lane {
	case ssa.VecLaneF32x4:
		if signed {
			xx := m.getOperand_Reg(m.c.ValueDefinition(x))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, xx, m.c.VRegOf(ret)))
		} else {
			xx := m.getOperand_Reg(m.c.ValueDefinition(x))
			// Copy the value to two temporary registers.
			tmp := m.copyToTmp(xx.reg())
			tmp2 := m.copyToTmp(xx.reg())

			// Clear the upper bits of each 32-bit element in tmp so that the remaining lower part
			// is exactly representable in float32.
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePslld, newOperandImm32(0xa), tmp))
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0xa), tmp))

			// Subtract the lower part (tmp) from tmp2 so that tmp2 keeps only the upper bits.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubd, newOperandReg(tmp), tmp2))

			// Convert the lower part in tmp.
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))

			// Right shift tmp2 by one and convert it, so tmp2 holds half of the conversion result of the upper bits.
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), tmp2))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp2), tmp2))

			// Double the converted halved upper bits.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp2), tmp2))

			// Get the conversion result by adding tmp (holding the lower-bit conversion) into tmp2.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddps, newOperandReg(tmp), tmp2))

			m.copyTo(tmp2, m.c.VRegOf(ret))
		}
	case ssa.VecLaneF64x2:
		if signed {
			xx := m.getOperand_Mem_Reg(m.c.ValueDefinition(x))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2pd, xx, m.c.VRegOf(ret)))
		} else {
			maskReg := m.c.AllocateVReg(ssa.TypeV128)
			maskLabel := m.getOrAllocateConstLabel(&m.constF64x2CvtFromIMaskIndex, f64x2CvtFromIMask[:])
			// maskReg = [0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00]
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))

			_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
			xx := m.copyToTmp(_xx.reg())

			// Given that we have xx = [d1, d2, d3, d4], this results in
			//	xx = [d1, [0x00, 0x00, 0x30, 0x43], d2, [0x00, 0x00, 0x30, 0x43]]
			//     = [float64(uint32(d1)) + 0x1.0p52, float64(uint32(d2)) + 0x1.0p52]
			//     ^See https://stackoverflow.com/questions/13269523/can-all-32-bit-ints-be-exactly-represented-as-a-double
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeUnpcklps, newOperandReg(maskReg), xx))

			// maskReg = [float64(0x1.0p52), float64(0x1.0p52)]
			maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), maskReg))

			// Now, we get the result as
			// 	xx = [float64(uint32(d1)), float64(uint32(d2))]
			// because the following equality always holds:
			//  float64(0x1.0p52 + float64(uint32(x))) - float64(0x1.0p52) = float64(uint32(x))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubpd, newOperandReg(maskReg), xx))

			m.copyTo(xx, m.c.VRegOf(ret))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}
}

var (
	// i32sMaxOnF64x2 holds math.MaxInt32(=2147483647.0) on two f64 lanes.
	i32sMaxOnF64x2 = [16]byte{
		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
		0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0xdf, 0x41, // float64(2147483647.0)
	}

	// i32uMaxOnF64x2 holds math.MaxUint32(=4294967295.0) on two f64 lanes.
	i32uMaxOnF64x2 = [16]byte{
		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
		0x00, 0x00, 0xe0, 0xff, 0xff, 0xff, 0xef, 0x41, // float64(4294967295.0)
	}

	// twop52 holds two float64(0x1.0p52) values on two f64 lanes. 0x1.0p52 is special in the
	// sense that, with this exponent, the low bits of the mantissa represent a corresponding
	// uint32 number, and after arithmetic such as addition or subtraction, the resulting
	// floating point still holds exactly that 32-bit integer in its mantissa.
	//
	// Note: the name twop52 is common across various compiler ecosystems.
	// 	E.g. https://github.com/llvm/llvm-project/blob/92ab024f81e5b64e258b7c3baaf213c7c26fcf40/compiler-rt/lib/builtins/floatdidf.c#L28
	// 	E.g. https://opensource.apple.com/source/clang/clang-425.0.24/src/projects/compiler-rt/lib/floatdidf.c.auto.html
	twop52 = [16]byte{
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, // float64(0x1.0p52)
	}
)

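// lowerVFcvtToIntSat lowers the saturating float-to-integer vector conversions, handling
// NaN and out-of-range lanes explicitly since CVTTPS2DQ/CVTTPD2DQ alone do not saturate
// the way Wasm requires.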
func (m *machine) lowerVFcvtToIntSat(x, ret ssa.Value, lane ssa.VecLane, signed bool) {
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())

	switch lane {
	case ssa.VecLaneF32x4:
		if signed {
			tmp := m.copyToTmp(xx)

			// Assuming we have xx = [v1, v2, v3, v4].
			//
			// Set all bits if lane is not NaN on tmp.
			// tmp[i] = 0xffffffff  if vi != NaN
			//        = 0           if vi == NaN
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredEQ_OQ), newOperandReg(xx), tmp))

			// Clear NaN lanes on xx, meaning that
			// 	xx[i] = vi  if vi != NaN
			//	        0   if vi == NaN
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp), xx))

			// tmp[i] = ^vi  if vi != NaN
			//        = 0    if vi == NaN
			// which means that tmp[i] & 0x80000000 != 0 if and only if vi is positive (and not NaN).
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeXorps, newOperandReg(xx), tmp))

			// xx[i] = int32(vi)   if vi != NaN and xx is not overflowing.
			//       = 0x80000000  if vi != NaN and xx is overflowing (See https://www.felixcloutier.com/x86/cvttps2dq)
			//       = 0           if vi == NaN
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))

			// Below, we have to convert 0x80000000 into 0x7FFFFFFF for positive overflowing lanes.
			//
			// tmp[i] = 0x80000000                         if vi is positive and overflowing
			//        = any satisfying any&0x80000000 = 0  otherwise.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(xx), tmp))

			// Arithmetic right shifting tmp by 31, meaning that we have
			// tmp[i] = 0xffffffff if vi is positive and overflowing, 0 otherwise.
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrad, newOperandImm32(0x1f), tmp))

			// Flip 0x80000000 to 0x7fffffff for those lanes, otherwise keep xx intact.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), xx))
		} else {
			tmp := m.c.AllocateVReg(ssa.TypeV128)
			m.insert(m.allocateInstr().asZeros(tmp))
			// Clamp NaN and negative lanes to zero: xx[i] = max(vi, 0).
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxps, newOperandReg(tmp), xx))
			// Build float32(2147483648.0) (= 2^31) in each lane of tmp.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePcmpeqd, newOperandReg(tmp), tmp))
			m.insert(m.allocateInstr().asXmmRmiReg(sseOpcodePsrld, newOperandImm32(0x1), tmp))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtdq2ps, newOperandReg(tmp), tmp))
			// tmp2 keeps the clamped input, while xx becomes the signed truncation
			// (lanes >= 2^31 become 0x80000000).
			tmp2 := m.copyToTmp(xx)
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(xx), xx))
			// tmp2[i] = vi - 2^31; tmp becomes the mask of lanes that exceed the uint32 range.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeSubps, newOperandReg(tmp), tmp2))
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmpps, uint8(cmpPredLE_OS), newOperandReg(tmp2), tmp))
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttps2dq, newOperandReg(tmp2), tmp2))
			// Saturate the too-large lanes to 0x7fffffff, clamp the rest of tmp2 to >= 0,
			// and add the upper-half contribution back into xx.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp2))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(tmp), tmp))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaxsd, newOperandReg(tmp), tmp2))
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodePaddd, newOperandReg(tmp2), xx))
		}

	case ssa.VecLaneF64x2:
		tmp2 := m.c.AllocateVReg(ssa.TypeV128)
		if signed {
			tmp := m.copyToTmp(xx)

			// Set all bits for non-NaN lanes, zeros otherwise.
			// I.e. tmp[i] = 0xffffffff_ffffffff if vi != NaN, 0 otherwise.
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_OQ), newOperandReg(xx), tmp))

			maskLabel := m.getOrAllocateConstLabel(&m.constI32sMaxOnF64x2Index, i32sMaxOnF64x2[:])
			// Load the 2147483647 into each lane of tmp2.
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))

			// tmp[i] = 2147483647 if vi != NaN, 0 otherwise.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAndps, newOperandReg(tmp2), tmp))

			// MINPD returns the source register's value as-is for NaN, so we have
			//  xx[i] = vi   if vi != NaN
			//        = 0    if vi == NaN
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp), xx))

			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeCvttpd2dq, newOperandReg(xx), xx))
		} else {
			tmp := m.c.AllocateVReg(ssa.TypeV128)
			m.insert(m.allocateInstr().asZeros(tmp))

			//  xx[i] = vi   if vi != NaN && vi > 0
			//        = 0    if vi == NaN || vi <= 0
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMaxpd, newOperandReg(tmp), xx))

			// tmp2[i] = float64(math.MaxUint32) = math.MaxUint32
			maskLabel := m.getOrAllocateConstLabel(&m.constI32uMaxOnF64x2Index, i32uMaxOnF64x2[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))

			// xx[i] = vi   if vi != NaN && vi > 0 && vi <= math.MaxUint32
			//       = 0    otherwise
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeMinpd, newOperandReg(tmp2), xx))

			// Round the floating points into integer.
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeRoundpd, 0x3, newOperandReg(xx), xx))

			// tmp2[i] = float64(0x1.0p52)
			maskLabel = m.getOrAllocateConstLabel(&m.constTwop52Index, twop52[:])
			m.insert(m.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(m.newAmodeRipRel(maskLabel)), tmp2))

			// xx[i] = float64(0x1.0p52) + float64(uint32(vi)) if vi != NaN && vi > 0 && vi <= math.MaxUint32
			//       = 0                                       otherwise
			//
			// This means that xx[i] holds exactly the same bits as uint32(vi) in its lower 32-bits.
			m.insert(m.allocateInstr().asXmmRmR(sseOpcodeAddpd, newOperandReg(tmp2), xx))

			// At this point, we have
			// 	xx  = [uint32(v0), float64(0x1.0p52), uint32(v1), float64(0x1.0p52)]
			//  tmp = [0, 0, 0, 0]
			// as 32x4 lanes. Therefore, SHUFPS with 0b00_00_10_00 results in
			//	xx = [xx[00], xx[10], tmp[00], tmp[00]] = [xx[00], xx[10], 0, 0]
			// meaning that for i = 0 and 1, we have
			//  xx[i] = uint32(vi) if vi != NaN && vi > 0 && vi <= math.MaxUint32
			//        = 0          otherwise.
			m.insert(m.allocateInstr().asXmmRmRImm(sseOpcodeShufps, 0b00_00_10_00, newOperandReg(tmp), xx))
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}

	m.copyTo(xx, m.c.VRegOf(ret))
}

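// lowerNarrow lowers the narrowing instructions to PACKSS*/PACKUS*.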
func (m *machine) lowerNarrow(x, y, ret ssa.Value, lane ssa.VecLane, signed bool) {
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())
	yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))

	var sseOp sseOpcode
	switch lane {
	case ssa.VecLaneI16x8:
		if signed {
			sseOp = sseOpcodePacksswb
		} else {
			sseOp = sseOpcodePackuswb
		}
	case ssa.VecLaneI32x4:
		if signed {
			sseOp = sseOpcodePackssdw
		} else {
			sseOp = sseOpcodePackusdw
		}
	default:
		panic(fmt.Sprintf("invalid lane type: %s", lane))
	}
	m.insert(m.allocateInstr().asXmmRmR(sseOp, yy, xx))
	m.copyTo(xx, m.c.VRegOf(ret))
}

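// lowerWideningPairwiseDotProductS lowers i32x4.dot_i16x8_s to PMADDWD.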
func (m *machine) lowerWideningPairwiseDotProductS(x, y, ret ssa.Value) {
	_xx := m.getOperand_Reg(m.c.ValueDefinition(x))
	xx := m.copyToTmp(_xx.reg())
	yy := m.getOperand_Mem_Reg(m.c.ValueDefinition(y))
	m.insert(m.allocateInstr().asXmmRmR(sseOpcodePmaddwd, yy, xx))
	m.copyTo(xx, m.c.VRegOf(ret))
}

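// lowerVIabs lowers the vector integer absolute value. i64x2 has no PABSQ, so it is
// emulated with PSUBQ (negation) plus BLENDVPD keyed on the sign of the negated value.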
func (m *machine) lowerVIabs(instr *ssa.Instruction) {
	x, lane := instr.ArgWithLane()
	rd := m.c.VRegOf(instr.Return())

	if lane == ssa.VecLaneI64x2 {
		_xx := m.getOperand_Reg(m.c.ValueDefinition(x))

		blendReg := xmm0VReg
		m.insert(m.allocateInstr().asDefineUninitializedReg(blendReg))

		tmp := m.copyToTmp(_xx.reg())
		xx := m.copyToTmp(_xx.reg())

		// Clear all bits on blendReg.
		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(blendReg), blendReg))
		// Subtract xx from blendReg, so that blendReg = -xx.
		m.insert(m.allocateInstr().asXmmRmR(sseOpcodePsubq, newOperandReg(xx), blendReg))
		// Copy the negated value ^^ back into tmp.
		m.copyTo(blendReg, tmp)

		// Select the original value (xx) for the lanes whose negation has the sign bit set
		// (i.e. the originally positive lanes), and keep the negated value otherwise.
		m.insert(m.allocateInstr().asBlendvpd(newOperandReg(xx), tmp))

		m.copyTo(tmp, rd)
	} else {
		var opcode sseOpcode
		switch lane {
		case ssa.VecLaneI8x16:
			opcode = sseOpcodePabsb
		case ssa.VecLaneI16x8:
			opcode = sseOpcodePabsw
		case ssa.VecLaneI32x4:
			opcode = sseOpcodePabsd
		}
		rn := m.getOperand_Reg(m.c.ValueDefinition(x))

		pabs := m.allocateInstr()
		pabs.asXmmUnaryRmR(opcode, rn, rd)
		m.insert(pabs)
	}
}

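// lowerVIpopcnt lowers i8x16.popcnt using the classic 4-bit nibble lookup with PSHUFB:
// the per-nibble popcounts of the low and high nibbles are looked up separately and summed.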
func (m *machine) lowerVIpopcnt(instr *ssa.Instruction) {
	x := instr.Arg()
	rm := m.getOperand_Reg(m.c.ValueDefinition(x))
	rd := m.c.VRegOf(instr.Return())

	tmp1 := m.c.AllocateVReg(ssa.TypeV128)
	m.lowerVconst(tmp1, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f)

	// Copy input into tmp2.
	tmp2 := m.copyToTmp(rm.reg())

	// Given that we have:
	//  rm = [b1, ..., b16] where bn = hn:ln and hn and ln are higher and lower 4-bits of bn.
	//
	// Take PAND on tmp1 and tmp2, so that we mask out all the higher bits.
	//  tmp2 = [l1, ..., l16].
	pand := m.allocateInstr()
	pand.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp2)
	m.insert(pand)

	// Do logical (packed word) right shift by 4 on rm and PAND against the mask (tmp1); meaning that we have
	//  tmp3 = [h1, ...., h16].
	tmp3 := m.copyToTmp(rm.reg())
	psrlw := m.allocateInstr()
	psrlw.asXmmRmiReg(sseOpcodePsrlw, newOperandImm32(4), tmp3)
	m.insert(psrlw)

	pand2 := m.allocateInstr()
	pand2.asXmmRmR(sseOpcodePand, newOperandReg(tmp1), tmp3)
	m.insert(pand2)

	// Read the popcntTable into tmp4, and we have
	//  tmp4 = [0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04]
	tmp4 := m.c.AllocateVReg(ssa.TypeV128)
	m.lowerVconst(tmp4, 0x03_02_02_01_02_01_01_00, 0x04_03_03_02_03_02_02_01)

	// Make a copy for later.
	tmp5 := m.copyToTmp(tmp4)

	//  tmp4 = [popcnt(l1), ..., popcnt(l16)].
	pshufb1 := m.allocateInstr()
	pshufb1.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp2), tmp4)
	m.insert(pshufb1)

	pshufb2 := m.allocateInstr()
	pshufb2.asXmmRmR(sseOpcodePshufb, newOperandReg(tmp3), tmp5)
	m.insert(pshufb2)

	// tmp4 + tmp5 is the result.
	paddb := m.allocateInstr()
	paddb.asXmmRmR(sseOpcodePaddb, newOperandReg(tmp5), tmp4)
	m.insert(paddb)

	m.copyTo(tmp4, rd)
}

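// lowerVImul lowers vector integer multiplication. i64x2 has no packed 64-bit multiply in
// SSE, so it is emulated with three PMULUDQ operations on the 32-bit halves.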
func (m *machine) lowerVImul(instr *ssa.Instruction) {
	x, y, lane := instr.Arg2WithLane()
	rd := m.c.VRegOf(instr.Return())
	if lane == ssa.VecLaneI64x2 {
		rn := m.getOperand_Reg(m.c.ValueDefinition(x))
		rm := m.getOperand_Reg(m.c.ValueDefinition(y))
		// Assuming that we have
		//	rn = [p1, p2] = [p1_lo, p1_hi, p2_lo, p2_hi]
		//  rm = [q1, q2] = [q1_lo, q1_hi, q2_lo, q2_hi]
		// where pN and qN are 64-bit (quad word) lanes, and pN_lo, pN_hi, qN_lo and qN_hi are 32-bit (double word) lanes.

		// Copy rn into tmp1.
		tmp1 := m.copyToTmp(rn.reg())

		// And do the logical right shift by 32-bit on tmp1, which makes tmp1 = [0, p1_hi, 0, p2_hi]
		psrlq1 := m.allocateInstr()
		psrlq1.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp1)
		m.insert(psrlq1)

		// Execute "pmuludq rm,tmp1", which makes tmp1 = [p1_hi*q1_lo, p2_hi*q2_lo] where each lane is 64-bit.
		pmuludq1 := m.allocateInstr()
		pmuludq1.asXmmRmR(sseOpcodePmuludq, rm, tmp1)
		m.insert(pmuludq1)

		// Copy rm value into tmp2.
		tmp2 := m.copyToTmp(rm.reg())

		// And do the logical right shift by 32-bit on tmp2, which makes tmp2 = [0, q1_hi, 0, q2_hi]
		psrlq2 := m.allocateInstr()
		psrlq2.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(32), tmp2)
		m.insert(psrlq2)

		// Execute "pmuludq rn,tmp2", which makes tmp2 = [p1_lo*q1_hi, p2_lo*q2_hi] where each lane is 64-bit.
		pmuludq2 := m.allocateInstr()
		pmuludq2.asXmmRmR(sseOpcodePmuludq, rn, tmp2)
		m.insert(pmuludq2)

		// Add tmp1 and tmp2, then do the logical left shift by 32-bit,
		// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32, (p2_lo*q2_hi+p2_hi*q2_lo)<<32]
		paddq1 := m.allocateInstr()
		paddq1.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp2), tmp1)
		m.insert(paddq1)

		psllq1 := m.allocateInstr()
		psllq1.asXmmRmiReg(sseOpcodePsllq, newOperandImm32(32), tmp1)
		m.insert(psllq1)

		// Copy rn value into tmp3.
		tmp3 := m.copyToTmp(rn.reg())

		// "pmuludq rm,tmp3" makes tmp3 = [p1_lo*q1_lo, p2_lo*q2_lo] where each lane is 64-bit.
		pmuludq3 := m.allocateInstr()
		pmuludq3.asXmmRmR(sseOpcodePmuludq, rm, tmp3)
		m.insert(pmuludq3)

		// Finally, we get the result by computing tmp1 + tmp3,
		// which makes tmp1 = [(p1_lo*q1_hi+p1_hi*q1_lo)<<32+p1_lo*q1_lo, (p2_lo*q2_hi+p2_hi*q2_lo)<<32+p2_lo*q2_lo]
		paddq2 := m.allocateInstr()
		paddq2.asXmmRmR(sseOpcodePaddq, newOperandReg(tmp3), tmp1)
		m.insert(paddq2)

		m.copyTo(tmp1, rd)

	} else {
		var opcode sseOpcode
		switch lane {
		case ssa.VecLaneI16x8:
			opcode = sseOpcodePmullw
		case ssa.VecLaneI32x4:
			opcode = sseOpcodePmulld
		default:
			panic("unsupported: " + lane.String())
		}
		m.lowerVbBinOp(opcode, x, y, instr.Return())
	}
}