package amd64

import (
	"context"
	"encoding/binary"
	"fmt"
	"math"
	"strings"

	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/ssa"
	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
	"github.com/tetratelabs/wazero/internal/platform"
)

// NewBackend returns a new backend for amd64.
func NewBackend() backend.Machine {
	m := &machine{
		cpuFeatures:                         platform.CpuFeatures,
		regAlloc:                            regalloc.NewAllocator[*instruction, *labelPosition, *regAllocFn](regInfo),
		spillSlots:                          map[regalloc.VRegID]int64{},
		amodePool:                           wazevoapi.NewPool[amode](nil),
		labelPositionPool:                   wazevoapi.NewIDedPool[labelPosition](resetLabelPosition),
		instrPool:                           wazevoapi.NewPool[instruction](resetInstruction),
		constSwizzleMaskConstIndex:          -1,
		constSqmulRoundSatIndex:             -1,
		constI8x16SHLMaskTableIndex:         -1,
		constI8x16LogicalSHRMaskTableIndex:  -1,
		constF64x2CvtFromIMaskIndex:         -1,
		constTwop52Index:                    -1,
		constI32sMaxOnF64x2Index:            -1,
		constI32uMaxOnF64x2Index:            -1,
		constAllOnesI8x16Index:              -1,
		constAllOnesI16x8Index:              -1,
		constExtAddPairwiseI16x8uMask1Index: -1,
		constExtAddPairwiseI16x8uMask2Index: -1,
	}
	m.regAllocFn.m = m
	return m
}

type (
	// machine implements backend.Machine for amd64.
	machine struct {
		c                        backend.Compiler
		stackBoundsCheckDisabled bool

		instrPool wazevoapi.Pool[instruction]
		amodePool wazevoapi.Pool[amode]

		cpuFeatures platform.CpuFeatureFlags

		regAlloc        regalloc.Allocator[*instruction, *labelPosition, *regAllocFn]
		regAllocFn      regAllocFn
		regAllocStarted bool

		// labelPositionPool is the pool of labelPosition. The ID is the label;
		// labels not greater than maxSSABlockID are identical to the corresponding ssa.BasicBlockID.
		labelPositionPool wazevoapi.IDedPool[labelPosition]
		// nextLabel is the next label to be allocated. The first free label comes after maxSSABlockID
		// so that SSA block IDs and their labels coincide, which is useful for debugging.
		nextLabel label
		// rootInstr is the first instruction of the function.
		rootInstr *instruction
		// currentLabelPos is the currently-compiled ssa.BasicBlock's labelPosition.
		currentLabelPos *labelPosition
		// orderedSSABlockLabelPos is the ordered list of labelPosition in the generated code for each ssa.BasicBlock.
		orderedSSABlockLabelPos []*labelPosition
		// returnLabelPos is the labelPosition for the return block.
		returnLabelPos labelPosition
		// perBlockHead and perBlockEnd are the head and tail of the instruction list per currently-compiled ssa.BasicBlock.
		perBlockHead, perBlockEnd *instruction
		// pendingInstructions are the instructions which are not yet emitted into the instruction list.
		pendingInstructions []*instruction
		// maxSSABlockID is the maximum ssa.BasicBlockID in the current function.
		maxSSABlockID label

		spillSlotSize int64
		spillSlots    map[regalloc.VRegID]int64
		currentABI    *backend.FunctionABI
		clobberedRegs []regalloc.VReg

		maxRequiredStackSizeForCalls int64

		labelResolutionPends []labelResolutionPend

		// jmpTableTargets holds the labels of the jump table targets.
		jmpTableTargets [][]uint32
		// jmpTableTargetsNext is the index into jmpTableTargets to be used for the next jump table.
		jmpTableTargetsNext int
		consts              []_const

		constSwizzleMaskConstIndex, constSqmulRoundSatIndex,
		constI8x16SHLMaskTableIndex, constI8x16LogicalSHRMaskTableIndex,
		constF64x2CvtFromIMaskIndex, constTwop52Index,
		constI32sMaxOnF64x2Index, constI32uMaxOnF64x2Index,
		constAllOnesI8x16Index, constAllOnesI16x8Index,
		constExtAddPairwiseI16x8uMask1Index, constExtAddPairwiseI16x8uMask2Index int
	}

	_const struct {
		lo, hi   uint64
		_var     []byte
		label    label
		labelPos *labelPosition
	}

	labelResolutionPend struct {
		instr       *instruction
		instrOffset int64
		// imm32Offset is the offset of the last 4 bytes of the instruction.
		imm32Offset int64
	}
)

type (
	// label represents a position in the generated code which is either
	// a real instruction or a constant pool entry (e.g. jump tables).
	//
	// This is exactly the same as the traditional "label" in assembly code.
	label uint32

	// labelPosition represents the region of generated code that a label refers to.
	// This implements regalloc.Block.
	labelPosition struct {
		// sb is not nil if this corresponds to a ssa.BasicBlock.
		sb ssa.BasicBlock
		// cur is used to walk through the instructions in the block during the register allocation.
		cur,
		// begin and end are the first and last instructions of the block.
		begin, end *instruction
		// binaryOffset is the offset in the binary where the label is located.
		binaryOffset int64
	}
)

// String returns a human-readable representation of the label, e.g. "L42".
func (l label) String() string {
	return fmt.Sprintf("L%d", l)
}

func resetLabelPosition(l *labelPosition) {
	*l = labelPosition{}
}

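// labelReturn is a pseudo label assigned to the single return (epilogue) block;
// math.MaxUint32 is assumed to be large enough never to collide with a label derived from an ssa.BasicBlockID.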
const labelReturn = math.MaxUint32

func ssaBlockLabel(sb ssa.BasicBlock) label {
	if sb.ReturnBlock() {
		return labelReturn
	}
	return label(sb.ID())
}

// getOrAllocateSSABlockLabelPosition returns the labelPosition for the given basic block.
func (m *machine) getOrAllocateSSABlockLabelPosition(sb ssa.BasicBlock) *labelPosition {
	if sb.ReturnBlock() {
		m.returnLabelPos.sb = sb
		return &m.returnLabelPos
	}

	l := ssaBlockLabel(sb)
	pos := m.labelPositionPool.GetOrAllocate(int(l))
	pos.sb = sb
	return pos
}

func (m *machine) getOrAllocateConstLabel(i *int, _var []byte) label {
	index := *i
	if index == -1 {
		l, pos := m.allocateLabel()
		index = len(m.consts)
		m.consts = append(m.consts, _const{
			_var:     _var,
			label:    l,
			labelPos: pos,
		})
		*i = index
	}
	return m.consts[index].label
}

// Reset implements backend.Machine.
func (m *machine) Reset() {
	m.consts = m.consts[:0]
	m.clobberedRegs = m.clobberedRegs[:0]
	for key := range m.spillSlots {
		m.clobberedRegs = append(m.clobberedRegs, regalloc.VReg(key))
	}
	for _, key := range m.clobberedRegs {
		delete(m.spillSlots, regalloc.VRegID(key))
	}

	m.stackBoundsCheckDisabled = false
	m.regAlloc.Reset()
	m.labelPositionPool.Reset()
	m.instrPool.Reset()
	m.regAllocStarted = false
	m.clobberedRegs = m.clobberedRegs[:0]

	m.spillSlotSize = 0
	m.maxRequiredStackSizeForCalls = 0
	m.perBlockHead, m.perBlockEnd, m.rootInstr = nil, nil, nil
	m.pendingInstructions = m.pendingInstructions[:0]
	m.orderedSSABlockLabelPos = m.orderedSSABlockLabelPos[:0]

	m.amodePool.Reset()
	m.jmpTableTargetsNext = 0
	m.constSwizzleMaskConstIndex = -1
	m.constSqmulRoundSatIndex = -1
	m.constI8x16SHLMaskTableIndex = -1
	m.constI8x16LogicalSHRMaskTableIndex = -1
	m.constF64x2CvtFromIMaskIndex = -1
	m.constTwop52Index = -1
	m.constI32sMaxOnF64x2Index = -1
	m.constI32uMaxOnF64x2Index = -1
	m.constAllOnesI8x16Index = -1
	m.constAllOnesI16x8Index = -1
	m.constExtAddPairwiseI16x8uMask1Index = -1
	m.constExtAddPairwiseI16x8uMask2Index = -1
}

// StartLoweringFunction implements backend.Machine StartLoweringFunction.
func (m *machine) StartLoweringFunction(maxBlockID ssa.BasicBlockID) {
	m.maxSSABlockID = label(maxBlockID)
	m.nextLabel = label(maxBlockID) + 1
}

// LinkAdjacentBlocks implements backend.Machine.
func (m *machine) LinkAdjacentBlocks(prev, next ssa.BasicBlock) {
	prevPos, nextPos := m.getOrAllocateSSABlockLabelPosition(prev), m.getOrAllocateSSABlockLabelPosition(next)
	prevPos.end.next = nextPos.begin
}

// StartBlock implements backend.Machine.
func (m *machine) StartBlock(blk ssa.BasicBlock) {
	m.currentLabelPos = m.getOrAllocateSSABlockLabelPosition(blk)
	labelPos := m.currentLabelPos
	end := m.allocateNop()
	m.perBlockHead, m.perBlockEnd = end, end
	labelPos.begin, labelPos.end = end, end
	m.orderedSSABlockLabelPos = append(m.orderedSSABlockLabelPos, labelPos)
}

// EndBlock implements backend.Machine.
func (m *machine) EndBlock() {
	// Insert a nop0 at the head of the block; this simplifies the logic of inserting instructions later.
	m.insertAtPerBlockHead(m.allocateNop())

	m.currentLabelPos.begin = m.perBlockHead

	if m.currentLabelPos.sb.EntryBlock() {
		m.rootInstr = m.perBlockHead
	}
}

func (m *machine) insertAtPerBlockHead(i *instruction) {
	if m.perBlockHead == nil {
		m.perBlockHead = i
		m.perBlockEnd = i
		return
	}

	i.next = m.perBlockHead
	m.perBlockHead.prev = i
	m.perBlockHead = i
}

// FlushPendingInstructions implements backend.Machine.
func (m *machine) FlushPendingInstructions() {
	l := len(m.pendingInstructions)
	if l == 0 {
		return
	}
	for i := l - 1; i >= 0; i-- { // reverse because we lower instructions in reverse order.
		m.insertAtPerBlockHead(m.pendingInstructions[i])
	}
	m.pendingInstructions = m.pendingInstructions[:0]
}

// DisableStackCheck implements backend.Machine.
func (m *machine) DisableStackCheck() { m.stackBoundsCheckDisabled = true }

// SetCompiler implements backend.Machine.
func (m *machine) SetCompiler(c backend.Compiler) {
	m.c = c
	m.regAllocFn.ssaB = c.SSABuilder()
}

// SetCurrentABI implements backend.Machine.
func (m *machine) SetCurrentABI(abi *backend.FunctionABI) { m.currentABI = abi }

// RegAlloc implements backend.Machine.
func (m *machine) RegAlloc() {
	rf := m.regAllocFn
	m.regAllocStarted = true
	m.regAlloc.DoAllocation(&rf)
	// Now that we know the final spill slot size, we must align spillSlotSize to 16 bytes.
	m.spillSlotSize = (m.spillSlotSize + 15) &^ 15
}

// InsertReturn implements backend.Machine.
func (m *machine) InsertReturn() {
	ret := m.allocateInstr().asRet()
	m.insert(ret)
}

// LowerSingleBranch implements backend.Machine.
func (m *machine) LowerSingleBranch(b *ssa.Instruction) {
	switch b.Opcode() {
	case ssa.OpcodeJump:
		_, _, targetBlkID := b.BranchData()
		if b.IsFallthroughJump() {
			return
		}
		jmp := m.allocateInstr()
		target := ssaBlockLabel(m.c.SSABuilder().BasicBlock(targetBlkID))
		if target == labelReturn {
			jmp.asRet()
		} else {
			jmp.asJmp(newOperandLabel(target))
		}
		m.insert(jmp)
	case ssa.OpcodeBrTable:
		index, targetBlkIDs := b.BrTableData()
		m.lowerBrTable(index, targetBlkIDs)
	default:
		panic("BUG: unexpected branch opcode: " + b.Opcode().String())
	}
}

func (m *machine) addJmpTableTarget(targets ssa.Values) (index int) {
	if m.jmpTableTargetsNext == len(m.jmpTableTargets) {
		m.jmpTableTargets = append(m.jmpTableTargets, make([]uint32, 0, len(targets.View())))
	}

	index = m.jmpTableTargetsNext
	m.jmpTableTargetsNext++
	m.jmpTableTargets[index] = m.jmpTableTargets[index][:0]
	for _, targetBlockID := range targets.View() {
		target := m.c.SSABuilder().BasicBlock(ssa.BasicBlockID(targetBlockID))
		m.jmpTableTargets[index] = append(m.jmpTableTargets[index], uint32(ssaBlockLabel(target)))
	}
	return
}

var condBranchMatches = [...]ssa.Opcode{ssa.OpcodeIcmp, ssa.OpcodeFcmp}

func ( *machine) ( ssa.Value,  ssa.Values) {
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .copyToTmp(.reg())

	 := len(.View())

	// First, we need to do the bounds check.
	 := .c.AllocateVReg(ssa.TypeI32)
	.lowerIconst(, uint64(-1), false)
	 := .allocateInstr().asCmpRmiR(true, newOperandReg(), , false)
	.insert()

	// Then clamp the index: conditionally move maxIndex into the index register when it is out of range.
	 := .allocateInstr().asCmove(condNB, newOperandReg(), , false)
	.insert()

	// Now that the index register holds a valid index, load the address of the jump table.
	 := .c.AllocateVReg(ssa.TypeI64)
	 := .allocateInstr()
	.insert()
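	// Note: the LEA above is only allocated and inserted here; it is completed below (asLEA)
	// once the jump table's br-target label has been allocated.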

	// Then add the target's offset into jmpTableAddr.
	 := .allocateInstr().asAluRmiR(aluRmiROpcodeAdd,
		// Shift by 3 because each entry is 8 bytes.
		newOperandMem(.newAmodeRegRegShift(0, , , 3)), , true)
	.insert()

	// Now ready to jump.
	 := .allocateInstr().asJmp(newOperandReg())
	.insert()

	,  := .allocateBrTarget()
	.insert()
	.asLEA(newOperandLabel(), )

	 := .allocateInstr()
	 := .addJmpTableTarget()
	.asJmpTableSequence(, )
	.insert()
}

// LowerConditionalBranch implements backend.Machine.
func ( *machine) ( *ssa.Instruction) {
	, ,  := .BranchData()
	if len() > 0 {
		panic(fmt.Sprintf(
			"conditional branch shouldn't have args; likely a bug in critical edge splitting: from %s to %s",
			.currentLabelPos.sb,
			,
		))
	}

	 := ssaBlockLabel(.c.SSABuilder().BasicBlock())
	 := .c.ValueDefinition()

	switch .c.MatchInstrOneOf(, condBranchMatches[:]) {
	case ssa.OpcodeIcmp:
		 := .Instr
		, ,  := .IcmpData()

		 := condFromSSAIntCmpCond()
		if .Opcode() == ssa.OpcodeBrz {
			 = .invert()
		}

		// First, perform the comparison and set the flag.
		,  := .c.ValueDefinition(), .c.ValueDefinition()
		if !.tryLowerBandToFlag(, ) {
			.lowerIcmpToFlag(, , .Type() == ssa.TypeI64)
		}

		// Then perform the conditional branch.
		.insert(.allocateInstr().asJmpIf(, newOperandLabel()))
		.Instr.MarkLowered()
	case ssa.OpcodeFcmp:
		 := .Instr

		, ,  := .lowerFcmpToFlags()
		 := .Opcode() == ssa.OpcodeBrz
		if  {
			 = .invert()
		}
		if  == condInvalid {
			.insert(.allocateInstr().asJmpIf(, newOperandLabel()))
		} else {
			if  {
				 = .invert()
				 = !
			}
			,  := .allocateInstr(), .allocateInstr()
			.insert()
			.insert()
			,  := .allocateBrTarget()
			.insert()
			if  {
				.asJmpIf(.invert(), newOperandLabel())
				.asJmpIf(, newOperandLabel())
			} else {
				.asJmpIf(, newOperandLabel())
				.asJmpIf(, newOperandLabel())
			}
		}

		.Instr.MarkLowered()
	default:
		 := .getOperand_Reg()

		var  cond
		if .Opcode() == ssa.OpcodeBrz {
			 = condZ
		} else {
			 = condNZ
		}

		// Perform test %v, %v to set the flag.
		 := .allocateInstr().asCmpRmiR(false, , .reg(), false)
		.insert()
		.insert(.allocateInstr().asJmpIf(, newOperandLabel()))
	}
}

// LowerInstr implements backend.Machine.
func ( *machine) ( *ssa.Instruction) {
	if  := .SourceOffset(); .Valid() {
		 := .allocateInstr().asEmitSourceOffsetInfo()
		.insert()
	}

	switch  := .Opcode();  {
	case ssa.OpcodeBrz, ssa.OpcodeBrnz, ssa.OpcodeJump, ssa.OpcodeBrTable:
		panic("BUG: branching instructions are handled by LowerBranches")
	case ssa.OpcodeReturn:
		panic("BUG: return must be handled by backend.Compiler")
	case ssa.OpcodeIconst, ssa.OpcodeF32const, ssa.OpcodeF64const: // Constant instructions are inlined.
	case ssa.OpcodeCall, ssa.OpcodeCallIndirect:
		.lowerCall()
	case ssa.OpcodeStore, ssa.OpcodeIstore8, ssa.OpcodeIstore16, ssa.OpcodeIstore32:
		.lowerStore()
	case ssa.OpcodeIadd:
		.lowerAluRmiROp(, aluRmiROpcodeAdd)
	case ssa.OpcodeIsub:
		.lowerAluRmiROp(, aluRmiROpcodeSub)
	case ssa.OpcodeImul:
		.lowerAluRmiROp(, aluRmiROpcodeMul)
	case ssa.OpcodeSdiv, ssa.OpcodeUdiv, ssa.OpcodeSrem, ssa.OpcodeUrem:
		 :=  == ssa.OpcodeSdiv ||  == ssa.OpcodeUdiv
		 :=  == ssa.OpcodeSdiv ||  == ssa.OpcodeSrem
		.lowerIDivRem(, , )
	case ssa.OpcodeBand:
		.lowerAluRmiROp(, aluRmiROpcodeAnd)
	case ssa.OpcodeBor:
		.lowerAluRmiROp(, aluRmiROpcodeOr)
	case ssa.OpcodeBxor:
		.lowerAluRmiROp(, aluRmiROpcodeXor)
	case ssa.OpcodeIshl:
		.lowerShiftR(, shiftROpShiftLeft)
	case ssa.OpcodeSshr:
		.lowerShiftR(, shiftROpShiftRightArithmetic)
	case ssa.OpcodeUshr:
		.lowerShiftR(, shiftROpShiftRightLogical)
	case ssa.OpcodeRotl:
		.lowerShiftR(, shiftROpRotateLeft)
	case ssa.OpcodeRotr:
		.lowerShiftR(, shiftROpRotateRight)
	case ssa.OpcodeClz:
		.lowerClz()
	case ssa.OpcodeCtz:
		.lowerCtz()
	case ssa.OpcodePopcnt:
		.lowerUnaryRmR(, unaryRmROpcodePopcnt)
	case ssa.OpcodeFadd, ssa.OpcodeFsub, ssa.OpcodeFmul, ssa.OpcodeFdiv:
		.lowerXmmRmR()
	case ssa.OpcodeFabs:
		.lowerFabsFneg()
	case ssa.OpcodeFneg:
		.lowerFabsFneg()
	case ssa.OpcodeCeil:
		.lowerRound(, roundingModeUp)
	case ssa.OpcodeFloor:
		.lowerRound(, roundingModeDown)
	case ssa.OpcodeTrunc:
		.lowerRound(, roundingModeZero)
	case ssa.OpcodeNearest:
		.lowerRound(, roundingModeNearest)
	case ssa.OpcodeFmin, ssa.OpcodeFmax:
		.lowerFminFmax()
	case ssa.OpcodeFcopysign:
		.lowerFcopysign()
	case ssa.OpcodeBitcast:
		.lowerBitcast()
	case ssa.OpcodeSqrt:
		.lowerSqrt()
	case ssa.OpcodeFpromote:
		 := .Arg()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())
		 := .allocateInstr()
		.asXmmUnaryRmR(sseOpcodeCvtss2sd, , )
		.insert()
	case ssa.OpcodeFdemote:
		 := .Arg()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())
		 := .allocateInstr()
		.asXmmUnaryRmR(sseOpcodeCvtsd2ss, , )
		.insert()
	case ssa.OpcodeFcvtToSint, ssa.OpcodeFcvtToSintSat:
		,  := .Arg2()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())
		 := .c.VRegOf()
		.lowerFcvtToSint(, .reg(), , .Type() == ssa.TypeF64,
			.Return().Type().Bits() == 64,  == ssa.OpcodeFcvtToSintSat)
	case ssa.OpcodeFcvtToUint, ssa.OpcodeFcvtToUintSat:
		,  := .Arg2()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())
		 := .c.VRegOf()
		.lowerFcvtToUint(, .reg(), , .Type() == ssa.TypeF64,
			.Return().Type().Bits() == 64,  == ssa.OpcodeFcvtToUintSat)
	case ssa.OpcodeFcvtFromSint:
		 := .Arg()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := newOperandReg(.c.VRegOf(.Return()))
		.lowerFcvtFromSint(, ,
			.Type() == ssa.TypeI64, .Return().Type().Bits() == 64)
	case ssa.OpcodeFcvtFromUint:
		 := .Arg()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := newOperandReg(.c.VRegOf(.Return()))
		.lowerFcvtFromUint(, , .Type() == ssa.TypeI64,
			.Return().Type().Bits() == 64)
	case ssa.OpcodeVanyTrue:
		.lowerVanyTrue()
	case ssa.OpcodeVallTrue:
		.lowerVallTrue()
	case ssa.OpcodeVhighBits:
		.lowerVhighBits()
	case ssa.OpcodeVbnot:
		.lowerVbnot()
	case ssa.OpcodeVband:
		,  := .Arg2()
		.lowerVbBinOp(sseOpcodePand, , , .Return())
	case ssa.OpcodeVbor:
		,  := .Arg2()
		.lowerVbBinOp(sseOpcodePor, , , .Return())
	case ssa.OpcodeVbxor:
		,  := .Arg2()
		.lowerVbBinOp(sseOpcodePxor, , , .Return())
	case ssa.OpcodeVbandnot:
		.lowerVbandnot(, sseOpcodePandn)
	case ssa.OpcodeVbitselect:
		.lowerVbitselect()
	case ssa.OpcodeVIadd:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePaddb
		case ssa.VecLaneI16x8:
			 = sseOpcodePaddw
		case ssa.VecLaneI32x4:
			 = sseOpcodePaddd
		case ssa.VecLaneI64x2:
			 = sseOpcodePaddq
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVSaddSat:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePaddsb
		case ssa.VecLaneI16x8:
			 = sseOpcodePaddsw
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVUaddSat:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePaddusb
		case ssa.VecLaneI16x8:
			 = sseOpcodePaddusw
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVIsub:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePsubb
		case ssa.VecLaneI16x8:
			 = sseOpcodePsubw
		case ssa.VecLaneI32x4:
			 = sseOpcodePsubd
		case ssa.VecLaneI64x2:
			 = sseOpcodePsubq
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVSsubSat:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePsubsb
		case ssa.VecLaneI16x8:
			 = sseOpcodePsubsw
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVUsubSat:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePsubusb
		case ssa.VecLaneI16x8:
			 = sseOpcodePsubusw
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVImul:
		.lowerVImul()
	case ssa.OpcodeVIneg:
		,  := .ArgWithLane()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePsubb
		case ssa.VecLaneI16x8:
			 = sseOpcodePsubw
		case ssa.VecLaneI32x4:
			 = sseOpcodePsubd
		case ssa.VecLaneI64x2:
			 = sseOpcodePsubq
		default:
			panic("BUG")
		}

		 := .c.AllocateVReg(ssa.TypeV128)
		.insert(.allocateInstr().asZeros())

		 := .allocateInstr()
		.asXmmRmR(, , )
		.insert()

		.copyTo(, )
	case ssa.OpcodeVFadd:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneF32x4:
			 = sseOpcodeAddps
		case ssa.VecLaneF64x2:
			 = sseOpcodeAddpd
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVFsub:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneF32x4:
			 = sseOpcodeSubps
		case ssa.VecLaneF64x2:
			 = sseOpcodeSubpd
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVFdiv:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneF32x4:
			 = sseOpcodeDivps
		case ssa.VecLaneF64x2:
			 = sseOpcodeDivpd
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVFmul:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneF32x4:
			 = sseOpcodeMulps
		case ssa.VecLaneF64x2:
			 = sseOpcodeMulpd
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVFneg:
		,  := .ArgWithLane()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())

		 := .c.AllocateVReg(ssa.TypeV128)

		var ,  sseOpcode
		var  uint32
		switch  {
		case ssa.VecLaneF32x4:
			, ,  = sseOpcodePslld, 31, sseOpcodeXorps
		case ssa.VecLaneF64x2:
			, ,  = sseOpcodePsllq, 63, sseOpcodeXorpd
		}

		 := .allocateInstr()
		.asZeros()
		.insert()

		// Set all bits on tmp by CMPPD with arg=0 (== pseudo CMPEQPD instruction).
		// See https://www.felixcloutier.com/x86/cmpps
		//
		// Note: if we did not clear all the bits with XORPS first, this might end up not setting
		// all ones on some lane if that lane is NaN.
		 := .allocateInstr()
		.asXmmRmRImm(sseOpcodeCmppd, uint8(cmpPredEQ_UQ), newOperandReg(), )
		.insert()

		// Do the left shift on each lane to set only the most significant bit in each.
		 := .allocateInstr()
		.asXmmRmiReg(, newOperandImm32(), )
		.insert()

		// Get the negated result by XOR on each lane with tmp.
		 = .allocateInstr()
		.asXmmRmR(, , )
		.insert()

		.copyTo(, )

	case ssa.OpcodeVSqrt:
		,  := .ArgWithLane()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())

		var  sseOpcode
		switch  {
		case ssa.VecLaneF32x4:
			 = sseOpcodeSqrtps
		case ssa.VecLaneF64x2:
			 = sseOpcodeSqrtpd
		}
		 := .allocateInstr()
		.asXmmUnaryRmR(, , )
		.insert()

	case ssa.OpcodeVImin:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePminsb
		case ssa.VecLaneI16x8:
			 = sseOpcodePminsw
		case ssa.VecLaneI32x4:
			 = sseOpcodePminsd
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVUmin:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePminub
		case ssa.VecLaneI16x8:
			 = sseOpcodePminuw
		case ssa.VecLaneI32x4:
			 = sseOpcodePminud
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVImax:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePmaxsb
		case ssa.VecLaneI16x8:
			 = sseOpcodePmaxsw
		case ssa.VecLaneI32x4:
			 = sseOpcodePmaxsd
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVUmax:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePmaxub
		case ssa.VecLaneI16x8:
			 = sseOpcodePmaxuw
		case ssa.VecLaneI32x4:
			 = sseOpcodePmaxud
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVAvgRound:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneI8x16:
			 = sseOpcodePavgb
		case ssa.VecLaneI16x8:
			 = sseOpcodePavgw
		}
		.lowerVbBinOp(, , , .Return())

	case ssa.OpcodeVIcmp:
		, , ,  := .VIcmpData()
		.lowerVIcmp(, , , .Return(), )

	case ssa.OpcodeVFcmp:
		, , ,  := .VFcmpData()
		.lowerVFcmp(, , , .Return(), )

	case ssa.OpcodeExtractlane:
		, , ,  := .ExtractlaneData()
		.lowerExtractLane(, , , .Return(), )

	case ssa.OpcodeInsertlane:
		, , ,  := .InsertlaneData()
		.lowerInsertLane(, , , .Return(), )

	case ssa.OpcodeSwizzle:
		, ,  := .Arg2WithLane()
		.lowerSwizzle(, , .Return())

	case ssa.OpcodeShuffle:
		, , ,  := .ShuffleData()
		.lowerShuffle(, , , , .Return())

	case ssa.OpcodeSplat:
		,  := .ArgWithLane()
		.lowerSplat(, .Return(), )

	case ssa.OpcodeSqmulRoundSat:
		,  := .Arg2()
		.lowerSqmulRoundSat(, , .Return())

	case ssa.OpcodeVZeroExtLoad:
		, ,  := .VZeroExtLoadData()
		var  sseOpcode
		// Both movss and movsd clear the higher bits of the destination register up to 128 bits.
		// https://www.felixcloutier.com/x86/movss
		// https://www.felixcloutier.com/x86/movsd
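		// Hence a single scalar load both reads the value and zeroes the remaining lanes,
		// which is exactly the semantics VZeroExtLoad needs.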
		if  == ssa.TypeF32 {
			 = sseOpcodeMovss
		} else {
			 = sseOpcodeMovsd
		}
		 := .lowerToAddressMode(, )
		 := .c.VRegOf(.Return())
		.insert(.allocateInstr().asXmmUnaryRmR(, newOperandMem(), ))

	case ssa.OpcodeVMinPseudo:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneF32x4:
			 = sseOpcodeMinps
		case ssa.VecLaneF64x2:
			 = sseOpcodeMinpd
		default:
			panic("BUG: unexpected lane type")
		}
		.lowerVbBinOpUnaligned(, , , .Return())

	case ssa.OpcodeVMaxPseudo:
		, ,  := .Arg2WithLane()
		var  sseOpcode
		switch  {
		case ssa.VecLaneF32x4:
			 = sseOpcodeMaxps
		case ssa.VecLaneF64x2:
			 = sseOpcodeMaxpd
		default:
			panic("BUG: unexpected lane type")
		}
		.lowerVbBinOpUnaligned(, , , .Return())

	case ssa.OpcodeVIshl:
		, ,  := .Arg2WithLane()
		.lowerVIshl(, , .Return(), )

	case ssa.OpcodeVSshr:
		, ,  := .Arg2WithLane()
		.lowerVSshr(, , .Return(), )

	case ssa.OpcodeVUshr:
		, ,  := .Arg2WithLane()
		.lowerVUshr(, , .Return(), )

	case ssa.OpcodeVCeil:
		,  := .ArgWithLane()
		.lowerVRound(, .Return(), 0x2,  == ssa.VecLaneF64x2)

	case ssa.OpcodeVFloor:
		,  := .ArgWithLane()
		.lowerVRound(, .Return(), 0x1,  == ssa.VecLaneF64x2)

	case ssa.OpcodeVTrunc:
		,  := .ArgWithLane()
		.lowerVRound(, .Return(), 0x3,  == ssa.VecLaneF64x2)

	case ssa.OpcodeVNearest:
		,  := .ArgWithLane()
		.lowerVRound(, .Return(), 0x0,  == ssa.VecLaneF64x2)

	case ssa.OpcodeExtIaddPairwise:
		, ,  := .ExtIaddPairwiseData()
		.lowerExtIaddPairwise(, .Return(), , )

	case ssa.OpcodeUwidenLow, ssa.OpcodeSwidenLow:
		,  := .ArgWithLane()
		.lowerWidenLow(, .Return(), ,  == ssa.OpcodeSwidenLow)

	case ssa.OpcodeUwidenHigh, ssa.OpcodeSwidenHigh:
		,  := .ArgWithLane()
		.lowerWidenHigh(, .Return(), ,  == ssa.OpcodeSwidenHigh)

	case ssa.OpcodeLoadSplat:
		, ,  := .LoadSplatData()
		.lowerLoadSplat(, , .Return(), )

	case ssa.OpcodeVFcvtFromUint, ssa.OpcodeVFcvtFromSint:
		,  := .ArgWithLane()
		.lowerVFcvtFromInt(, .Return(), ,  == ssa.OpcodeVFcvtFromSint)

	case ssa.OpcodeVFcvtToSintSat, ssa.OpcodeVFcvtToUintSat:
		,  := .ArgWithLane()
		.lowerVFcvtToIntSat(, .Return(), ,  == ssa.OpcodeVFcvtToSintSat)

	case ssa.OpcodeSnarrow, ssa.OpcodeUnarrow:
		, ,  := .Arg2WithLane()
		.lowerNarrow(, , .Return(), ,  == ssa.OpcodeSnarrow)

	case ssa.OpcodeFvpromoteLow:
		 := .Arg()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())
		.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtps2pd, , ))

	case ssa.OpcodeFvdemote:
		 := .Arg()
		 := .getOperand_Reg(.c.ValueDefinition())
		 := .c.VRegOf(.Return())
		.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeCvtpd2ps, , ))

	case ssa.OpcodeWideningPairwiseDotProductS:
		,  := .Arg2()
		.lowerWideningPairwiseDotProductS(, , .Return())

	case ssa.OpcodeVIabs:
		.lowerVIabs()
	case ssa.OpcodeVIpopcnt:
		.lowerVIpopcnt()
	case ssa.OpcodeVFmin:
		.lowerVFmin()
	case ssa.OpcodeVFmax:
		.lowerVFmax()
	case ssa.OpcodeVFabs:
		.lowerVFabs()
	case ssa.OpcodeUndefined:
		.insert(.allocateInstr().asUD2())
	case ssa.OpcodeExitWithCode:
		,  := .ExitWithCodeData()
		.lowerExitWithCode(.c.VRegOf(), )
	case ssa.OpcodeExitIfTrueWithCode:
		, ,  := .ExitIfTrueWithCodeData()
		.lowerExitIfTrueWithCode(.c.VRegOf(), , )
	case ssa.OpcodeLoad:
		, ,  := .LoadData()
		 := .c.VRegOf(.Return())
		.lowerLoad(, , , )
	case ssa.OpcodeUload8, ssa.OpcodeUload16, ssa.OpcodeUload32, ssa.OpcodeSload8, ssa.OpcodeSload16, ssa.OpcodeSload32:
		, ,  := .LoadData()
		 := .c.VRegOf(.Return())
		.lowerExtLoad(, , , )
	case ssa.OpcodeVconst:
		 := .c.VRegOf(.Return())
		,  := .VconstData()
		.lowerVconst(, , )
	case ssa.OpcodeSExtend, ssa.OpcodeUExtend:
		, ,  := .ExtendData()
		.lowerExtend(.Arg(), .Return(), , , )
	case ssa.OpcodeIcmp:
		.lowerIcmp()
	case ssa.OpcodeFcmp:
		.lowerFcmp()
	case ssa.OpcodeSelect:
		, ,  := .SelectData()
		.lowerSelect(, , , .Return())
	case ssa.OpcodeIreduce:
		 := .getOperand_Mem_Reg(.c.ValueDefinition(.Arg()))
		 := .Return()
		 := .c.VRegOf()

		if .Type() != ssa.TypeI32 {
			panic("TODO?: Ireduce to non-i32")
		}
		.insert(.allocateInstr().asMovzxRmR(extModeLQ, , ))

	case ssa.OpcodeAtomicLoad:
		 := .Arg()
		 := .AtomicTargetSize()
		 := .c.VRegOf(.Return())

		// At this point, the ptr is ensured to be aligned, so using a normal load is atomic.
		// https://github.com/golang/go/blob/adead1a93f472affa97c494ef19f2f492ee6f34a/src/runtime/internal/atomic/atomic_amd64.go#L30
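		// (On x86-64, plain loads of naturally aligned 1/2/4/8-byte values are guaranteed to be atomic.)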
		 := newOperandMem(.lowerToAddressMode(, 0))
		 := .allocateInstr()
		switch  {
		case 8:
			.asMov64MR(, )
		case 4:
			.asMovzxRmR(extModeLQ, , )
		case 2:
			.asMovzxRmR(extModeWQ, , )
		case 1:
			.asMovzxRmR(extModeBQ, , )
		default:
			panic("BUG")
		}
		.insert()

	case ssa.OpcodeFence:
		.insert(.allocateInstr().asMFence())

	case ssa.OpcodeAtomicStore:
		,  := .Arg2()
		 := .AtomicTargetSize()

		 := .getOperand_Reg(.c.ValueDefinition())
		// The content of the val register will be overwritten by xchg, so we need to copy it to a temporary register.
		 := .copyToTmp(.reg())

		 := newOperandMem(.lowerToAddressMode(, 0))
		 := .allocateInstr().asXCHG(, , byte())
		.insert()

	case ssa.OpcodeAtomicCas:
		, ,  := .Arg3()
		 := .AtomicTargetSize()
		.lowerAtomicCas(, , , , .Return())

	case ssa.OpcodeAtomicRmw:
		,  := .Arg2()
		,  := .AtomicRmwData()
		.lowerAtomicRmw(, , , , .Return())

	default:
		panic("TODO: lowering " + .String())
	}
}

func ( *machine) ( ssa.AtomicRmwOp, ,  ssa.Value,  uint64,  ssa.Value) {
	 := .lowerToAddressMode(, 0)
	 := .getOperand_Reg(.c.ValueDefinition())

	switch  {
	case ssa.AtomicRmwOpAdd, ssa.AtomicRmwOpSub:
		 := .copyToTmp(.reg())
		if  == ssa.AtomicRmwOpSub {
			// Negate the value.
			.insert(.allocateInstr().asNeg(newOperandReg(), true))
		}
		.insert(.allocateInstr().asLockXAdd(, , byte()))
		.clearHigherBitsForAtomic(, , .Type())
		.copyTo(, .c.VRegOf())

	case ssa.AtomicRmwOpAnd, ssa.AtomicRmwOpOr, ssa.AtomicRmwOpXor:
		 := raxVReg
		// Reserve rax for the accumulator to make regalloc happy.
		// Note: do this initialization before defining valCopied, because they might be assigned the same
		// register, in which case an unnecessary load/store would be performed inside the loop.
		// This could be avoided once the register allocator is smarter.
		.insert(.allocateInstr().asDefineUninitializedReg())

		// Copy the value to a temporary register.
		 := .copyToTmp(.reg())
		.clearHigherBitsForAtomic(, , .Type())

		 := newOperandMem()
		 := .c.AllocateVReg(ssa.TypeI64)
		,  := .allocateBrTarget()
		{
			.insert()
			// Reset the value on tmp by the original value.
			.copyTo(, )
			// Load the current value at the memory location into accumulator.
			switch  {
			case 1:
				.insert(.allocateInstr().asMovzxRmR(extModeBQ, , ))
			case 2:
				.insert(.allocateInstr().asMovzxRmR(extModeWQ, , ))
			case 4:
				.insert(.allocateInstr().asMovzxRmR(extModeLQ, , ))
			case 8:
				.insert(.allocateInstr().asMov64MR(, ))
			default:
				panic("BUG")
			}
			// Then perform the logical operation on the accumulator and the value on tmp.
			switch  {
			case ssa.AtomicRmwOpAnd:
				.insert(.allocateInstr().asAluRmiR(aluRmiROpcodeAnd, newOperandReg(), , true))
			case ssa.AtomicRmwOpOr:
				.insert(.allocateInstr().asAluRmiR(aluRmiROpcodeOr, newOperandReg(), , true))
			case ssa.AtomicRmwOpXor:
				.insert(.allocateInstr().asAluRmiR(aluRmiROpcodeXor, newOperandReg(), , true))
			default:
				panic("BUG")
			}
			// Finally, try compare-exchange the value at the memory location with the tmp.
			.insert(.allocateInstr().asLockCmpXCHG(, .addressMode(), byte()))
			// If it succeeds, ZF will be set, and we can break the loop.
			.insert(.allocateInstr().asJmpIf(condNZ, newOperandLabel()))
		}

		// valCopied must be alive at the end of the loop.
		.insert(.allocateInstr().asNopUseReg())

		// At this point, accumulator contains the result.
		.clearHigherBitsForAtomic(, , .Type())
		.copyTo(, .c.VRegOf())

	case ssa.AtomicRmwOpXchg:
		 := .copyToTmp(.reg())

		.insert(.allocateInstr().asXCHG(, newOperandMem(), byte()))
		.clearHigherBitsForAtomic(, , .Type())
		.copyTo(, .c.VRegOf())

	default:
		panic("BUG")
	}
}

func ( *machine) (, ,  ssa.Value,  uint64,  ssa.Value) {
	 := .lowerToAddressMode(, 0)
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .getOperand_Reg(.c.ValueDefinition())

	 := raxVReg
	.copyTo(.reg(), )
	.insert(.allocateInstr().asLockCmpXCHG(.reg(), , byte()))
	.clearHigherBitsForAtomic(, , .Type())
	.copyTo(, .c.VRegOf())
}

func ( *machine) ( regalloc.VReg,  uint64,  ssa.Type) {
	switch  {
	case ssa.TypeI32:
		switch  {
		case 1:
			.insert(.allocateInstr().asMovzxRmR(extModeBL, newOperandReg(), ))
		case 2:
			.insert(.allocateInstr().asMovzxRmR(extModeWL, newOperandReg(), ))
		}
	case ssa.TypeI64:
		switch  {
		case 1:
			.insert(.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(), ))
		case 2:
			.insert(.allocateInstr().asMovzxRmR(extModeWQ, newOperandReg(), ))
		case 4:
			.insert(.allocateInstr().asMovzxRmR(extModeLQ, newOperandReg(), ))
		}
	}
}

func ( *machine) ( *ssa.Instruction) {
	, ,  := .lowerFcmpToFlags()
	 := .c.VRegOf(.Return())
	if  == condInvalid {
		 := .c.AllocateVReg(ssa.TypeI32)
		.insert(.allocateInstr().asSetcc(, ))
		// On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match
		// the semantics of Icmp that sets either 0 or 1.
		.insert(.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(), ))
	} else {
		,  := .c.AllocateVReg(ssa.TypeI32), .c.AllocateVReg(ssa.TypeI32)
		.insert(.allocateInstr().asSetcc(, ))
		.insert(.allocateInstr().asSetcc(, ))
		var  aluRmiROpcode
		if  {
			 = aluRmiROpcodeAnd
		} else {
			 = aluRmiROpcodeOr
		}
		.insert(.allocateInstr().asAluRmiR(, newOperandReg(), , false))
		.insert(.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(), ))
	}
}

func ( *machine) ( *ssa.Instruction) {
	, ,  := .IcmpData()
	.lowerIcmpToFlag(.c.ValueDefinition(), .c.ValueDefinition(), .Type() == ssa.TypeI64)
	 := .c.VRegOf(.Return())
	 := .c.AllocateVReg(ssa.TypeI32)
	.insert(.allocateInstr().asSetcc(condFromSSAIntCmpCond(), ))
	// On amd64, setcc only sets the first byte of the register, so we need to zero extend it to match
	// the semantics of Icmp that sets either 0 or 1.
	.insert(.allocateInstr().asMovzxRmR(extModeBQ, newOperandReg(), ))
}

func ( *machine) (, , ,  ssa.Value) {
	,  := .getOperand_Mem_Reg(.c.ValueDefinition()), .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf()

	var  cond
	 := .c.ValueDefinition()
	switch .c.MatchInstrOneOf(, condBranchMatches[:]) {
	case ssa.OpcodeIcmp:
		 := .Instr
		, ,  := .IcmpData()
		.lowerIcmpToFlag(.c.ValueDefinition(), .c.ValueDefinition(), .Type() == ssa.TypeI64)
		 = condFromSSAIntCmpCond()
		.Lowered()
	default: // TODO: match ssa.OpcodeFcmp for optimization, but seems a bit complex.
		 := .getOperand_Reg()
		 := .allocateInstr().asCmpRmiR(false, , .reg(), false)
		.insert()
		 = condNZ
	}

	if  := .Type(); .IsInt() {
		 := .Bits() == 64
		 := .allocateInstr()
		 := .c.AllocateVReg()
		switch .kind {
		case operandKindReg:
			.asMovRR(.reg(), , )
		case operandKindMem:
			if  {
				.asMov64MR(, )
			} else {
				.asMovzxRmR(extModeLQ, , )
			}
		default:
			panic("BUG")
		}
		.insert()
		 := .allocateInstr().asCmove(, , , )
		.insert()
		.insert(.allocateInstr().asMovRR(, , ))
	} else {
		 := .allocateInstr()
		 := .c.AllocateVReg()
		switch  {
		case ssa.TypeF32:
			.asXmmUnaryRmR(sseOpcodeMovss, , )
		case ssa.TypeF64:
			.asXmmUnaryRmR(sseOpcodeMovsd, , )
		case ssa.TypeV128:
			.asXmmUnaryRmR(sseOpcodeMovdqu, , )
		default:
			panic("BUG")
		}
		.insert()

		 := .allocateInstr().asXmmCMov(, , , .Size())
		.insert()

		.copyTo(, )
	}
}

func ( *machine) ( *instruction) {
	 := .op1
	 := .op2.reg()
	 := cond(.u1)

	 := .allocateInstr()
	.insert()

	 := .allocateInstr()
	switch .u2 {
	case 4:
		.asXmmUnaryRmR(sseOpcodeMovss, , )
	case 8:
		.asXmmUnaryRmR(sseOpcodeMovsd, , )
	case 16:
		.asXmmUnaryRmR(sseOpcodeMovdqu, , )
	default:
		panic("BUG")
	}
	.insert()

	,  := .allocateBrTarget()
	.insert()
	.asJmpIf(.invert(), newOperandLabel())
}

func ( *machine) (,  ssa.Value, ,  byte,  bool) {
	 := .c.VRegOf()
	 := .getOperand_Mem_Reg(.c.ValueDefinition())

	 := .c.AllocateVReg(.Type())

	 := .allocateInstr()
	switch {
	case  == 8 &&  == 16 && :
		.asMovsxRmR(extModeBQ, , )
	case  == 8 &&  == 16 && !:
		.asMovzxRmR(extModeBL, , )
	case  == 8 &&  == 32 && :
		.asMovsxRmR(extModeBL, , )
	case  == 8 &&  == 32 && !:
		.asMovzxRmR(extModeBQ, , )
	case  == 8 &&  == 64 && :
		.asMovsxRmR(extModeBQ, , )
	case  == 8 &&  == 64 && !:
		.asMovzxRmR(extModeBQ, , )
	case  == 16 &&  == 32 && :
		.asMovsxRmR(extModeWL, , )
	case  == 16 &&  == 32 && !:
		.asMovzxRmR(extModeWL, , )
	case  == 16 &&  == 64 && :
		.asMovsxRmR(extModeWQ, , )
	case  == 16 &&  == 64 && !:
		.asMovzxRmR(extModeWQ, , )
	case  == 32 &&  == 64 && :
		.asMovsxRmR(extModeLQ, , )
	case  == 32 &&  == 64 && !:
		.asMovzxRmR(extModeLQ, , )
	default:
		panic(fmt.Sprintf("BUG: unhandled extend: from=%d, to=%d, signed=%t", , , ))
	}
	.insert()

	.copyTo(, )
}

func ( *machine) ( regalloc.VReg, ,  uint64) {
	if  == 0 &&  == 0 {
		.insert(.allocateInstr().asZeros())
		return
	}

	 := .allocateInstr()
	,  := .allocateLabel()
	.consts = append(.consts, _const{label: , labelPos: , lo: , hi: })
	.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(.newAmodeRipRel()), )
	.insert()
}

func ( *machine) ( *ssa.Instruction) {
	if .cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
		.lowerUnaryRmR(, unaryRmROpcodeTzcnt)
	} else {
		// On processors that do not support TZCNT, the BSF instruction is
		// executed instead. The key difference between TZCNT and BSF is
		// that if the source operand is zero, the content of the
		// destination operand is undefined.
		// https://www.felixcloutier.com/x86/tzcnt.html
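		//
		// A rough sketch of the sequence emitted below (exact operand forms are decided later):
		//
		//	test  src, src
		//	jnz   nonZero
		//	mov   $width, tmp   ; src == 0: the result is the operand width in bits
		//	jmp   end
		// nonZero:
		//	bsf   src, tmp
		// end: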

		 := .Arg()
		if !.Type().IsInt() {
			panic("BUG?")
		}
		 := .Type().Bits() == 64

		 := .c.ValueDefinition()
		 := .c.AllocateVReg(.Type())
		 := .getOperand_Reg()

		// First, we have to check if the target is non-zero.
		 := .allocateInstr()
		.asCmpRmiR(false, , .reg(), )
		.insert()

		 := .allocateInstr()
		.insert()

		// If the value is zero, we just push the const value.
		.lowerIconst(, uint64(.Type().Bits()), )

		// Now jump right after the non-zero case.
		 := .allocateInstr()
		.insert()

		// jmpNz target label is set here.
		,  := .allocateBrTarget()
		.asJmpIf(condNZ, newOperandLabel())
		.insert()

		// Emit the non-zero case.
		 := .allocateInstr()
		.asUnaryRmR(unaryRmROpcodeBsf, , , )
		.insert()

		// jmpAtEnd target label is set here.
		,  := .allocateBrTarget()
		.asJmp(newOperandLabel())
		.insert()

		.copyTo(, .c.VRegOf(.Return()))
	}
}

func ( *machine) ( *ssa.Instruction) {
	if .cpuFeatures.HasExtra(platform.CpuExtraFeatureAmd64ABM) {
		.lowerUnaryRmR(, unaryRmROpcodeLzcnt)
	} else {
		// On processors that do not support LZCNT, we combine BSR (which calculates
		// the most significant set bit) with XOR. This logic is described in the
		// "Replace Raw Assembly Code with Builtin Intrinsics" section of:
		// https://developer.apple.com/documentation/apple-silicon/addressing-architectural-differences-in-your-macos-code.
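		//
		// For example, for a 64-bit input whose most significant set bit is at position 40,
		// BSR yields 40 and 40 XOR 63 = 23, which is exactly the leading-zero count.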

		 := .Arg()
		if !.Type().IsInt() {
			panic("BUG?")
		}
		 := .Type().Bits() == 64

		 := .c.ValueDefinition()
		 := .getOperand_Reg()
		 := .c.AllocateVReg(.Type())

		// First, we have to check if the rm is non-zero as BSR is undefined
		// on zero. See https://www.felixcloutier.com/x86/bsr.
		 := .allocateInstr()
		.asCmpRmiR(false, , .reg(), )
		.insert()

		 := .allocateInstr()
		.insert()

		// If the value is zero, we just push the const value.
		.lowerIconst(, uint64(.Type().Bits()), )

		// Now jump right after the non-zero case.
		 := .allocateInstr()
		.insert()

		// jmpNz target label is set here.
		,  := .allocateBrTarget()
		.asJmpIf(condNZ, newOperandLabel())
		.insert()

		// Emit the non-zero case.
		 := .allocateInstr()
		.asUnaryRmR(unaryRmROpcodeBsr, , , )
		.insert()

		// Now we XOR the value with the bit length minus one.
		 := .allocateInstr()
		.asAluRmiR(aluRmiROpcodeXor, newOperandImm32(uint32(.Type().Bits()-1)), , )
		.insert()

		// jmpAtEnd target label is set here.
		,  := .allocateBrTarget()
		.asJmp(newOperandLabel())
		.insert()

		.copyTo(, .c.VRegOf(.Return()))
	}
}

func ( *machine) ( *ssa.Instruction,  unaryRmROpcode) {
	 := .Arg()
	if !.Type().IsInt() {
		panic("BUG?")
	}
	 := .Type().Bits() == 64

	 := .c.ValueDefinition()
	 := .getOperand_Mem_Reg()
	 := .c.VRegOf(.Return())

	 := .allocateInstr()
	.asUnaryRmR(, , , )
	.insert()
}

func ( *machine) ( ssa.Value,  uint32,  ssa.Type,  regalloc.VReg) {
	 := newOperandMem(.lowerToAddressMode(, ))
	 := .allocateInstr()
	switch  {
	case ssa.TypeI32:
		.asMovzxRmR(extModeLQ, , )
	case ssa.TypeI64:
		.asMov64MR(, )
	case ssa.TypeF32:
		.asXmmUnaryRmR(sseOpcodeMovss, , )
	case ssa.TypeF64:
		.asXmmUnaryRmR(sseOpcodeMovsd, , )
	case ssa.TypeV128:
		.asXmmUnaryRmR(sseOpcodeMovdqu, , )
	default:
		panic("BUG")
	}
	.insert()
}

func ( *machine) ( ssa.Opcode,  ssa.Value,  uint32,  regalloc.VReg) {
	 := newOperandMem(.lowerToAddressMode(, ))
	 := .allocateInstr()
	switch  {
	case ssa.OpcodeUload8:
		.asMovzxRmR(extModeBQ, , )
	case ssa.OpcodeUload16:
		.asMovzxRmR(extModeWQ, , )
	case ssa.OpcodeUload32:
		.asMovzxRmR(extModeLQ, , )
	case ssa.OpcodeSload8:
		.asMovsxRmR(extModeBQ, , )
	case ssa.OpcodeSload16:
		.asMovsxRmR(extModeWQ, , )
	case ssa.OpcodeSload32:
		.asMovsxRmR(extModeLQ, , )
	default:
		panic("BUG")
	}
	.insert()
}

func ( *machine) ( regalloc.VReg,  ssa.Value,  wazevoapi.ExitCode) {
	 := .c.ValueDefinition()
	if !.c.MatchInstr(, ssa.OpcodeIcmp) {
		panic("TODO: ExitIfTrue must come after Icmp at the moment: " + .Instr.Opcode().String())
	}
	 := .Instr
	.MarkLowered()

	// We need to copy the execution context to a temp register, because if it's spilled,
	// it might end up being reloaded inside the exiting branch.
	 := .copyToTmp()

	, ,  := .IcmpData()
	,  := .c.ValueDefinition(), .c.ValueDefinition()
	if !.tryLowerBandToFlag(, ) {
		.lowerIcmpToFlag(, , .Type() == ssa.TypeI64)
	}

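	// Conditionally jump over the exit sequence: the condition is inverted so that the common,
	// non-trapping path skips the exit code emitted by lowerExitWithCode and continues at the
	// label it returns.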
	 := .allocateInstr()
	.insert()
	 := .lowerExitWithCode(, )
	.asJmpIf(condFromSSAIntCmpCond().invert(), newOperandLabel())
}

func ( *machine) (,  backend.SSAValueDefinition) ( bool) {
	var  backend.SSAValueDefinition
	var  bool
	if .IsFromInstr() && .Instr.Constant() && .Instr.ConstantVal() == 0 {
		if .c.MatchInstr(, ssa.OpcodeBand) {
			 = 
			 = true
		}
	}

	if .IsFromInstr() && .Instr.Constant() && .Instr.ConstantVal() == 0 {
		if .c.MatchInstr(, ssa.OpcodeBand) {
			 = 
			 = true
		}
	}

	if ! {
		return false
	}

	 := .Instr
	,  := .Arg2()

	 := .getOperand_Reg(.c.ValueDefinition())
	 := .getOperand_Mem_Imm32_Reg(.c.ValueDefinition())
	 := .allocateInstr().asCmpRmiR(false, , .reg(), .Type() == ssa.TypeI64)
	.insert()
	.MarkLowered()
	return true
}

func ( *machine) (,  regalloc.VReg) (, ,  *instruction) {
	 = .allocateInstr().asMovRM(
		rspVReg,
		newOperandMem(.newAmodeImmReg(wazevoapi.ExecutionContextOffsetStackPointerBeforeGoCall.U32(), )),
		8,
	)

	 = .allocateInstr().asMovRM(
		rbpVReg,
		newOperandMem(.newAmodeImmReg(wazevoapi.ExecutionContextOffsetFramePointerBeforeGoCall.U32(), )),
		8,
	)
	 = .allocateInstr().asMovRM(
		,
		newOperandMem(.newAmodeImmReg(wazevoapi.ExecutionContextOffsetExitCodeOffset.U32(), )),
		4,
	)
	return
}

func ( *machine) ( regalloc.VReg,  wazevoapi.ExitCode) ( label) {
	 := rbpVReg
	, ,  := .allocateExitInstructions(, )

	// Save RSP and RBP, and write the exit code.
	.insert()
	.insert()
	.lowerIconst(, uint64(), false)
	.insert()

	 := rbpVReg

	// Next, save the current address for stack unwinding.
	,  := .allocateBrTarget()
	.insert()
	 := .allocateInstr().asLEA(newOperandLabel(), )
	.insert()
	 := .allocateInstr().asMovRM(
		,
		newOperandMem(.newAmodeImmReg(wazevoapi.ExecutionContextOffsetGoCallReturnAddress.U32(), )),
		8,
	)
	.insert()

	// Finally exit.
	 := .allocateExitSeq()
	.insert()

	// Return the label for continuation.
	,  := .allocateBrTarget()
	.insert()
	return 
}

func ( *machine) ( *ssa.Instruction,  aluRmiROpcode) {
	,  := .Arg2()
	if !.Type().IsInt() {
		panic("BUG?")
	}

	 := .Type().Bits() == 64

	,  := .c.ValueDefinition(), .c.ValueDefinition()

	// TODO: commutative args can be swapped if one of them is an immediate.
	 := .getOperand_Reg()
	 := .getOperand_Mem_Imm32_Reg()
	 := .c.VRegOf(.Return())

	// rn is being overwritten, so we first copy its value to a temp register,
	// in case it is referenced again later.
	 := .copyToTmp(.reg())

	 := .allocateInstr()
	.asAluRmiR(, , , )
	.insert()

	// tmp now contains the result, we copy it to the dest register.
	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction,  shiftROp) {
	,  := .Arg2()
	if !.Type().IsInt() {
		panic("BUG?")
	}
	 := .Type().Bits() == 64

	,  := .c.ValueDefinition(), .c.ValueDefinition()

	 := .getOperand_Imm32_Reg()
	 := .getOperand_Reg()
	 := .c.VRegOf(.Return())

	// rx is being overwritten, so we first copy its value to a temp register,
	// in case it is referenced again later.
	 := .copyToTmp(.reg())

	if .kind == operandKindReg {
		// If opAmt is a register we must copy its value to rcx,
		// because shiftR encoding mandates that the shift amount is in rcx.
		.copyTo(.reg(), rcxVReg)

		 := .allocateInstr()
		.asShiftR(, newOperandReg(rcxVReg), , )
		.insert()

	} else {
		 := .allocateInstr()
		.asShiftR(, , , )
		.insert()
	}

	// tmp now contains the result, we copy it to the dest register.
	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction) {
	,  := .Arg2()
	if !.Type().IsFloat() {
		panic("BUG?")
	}
	 := .Type().Bits() == 64

	var  sseOpcode
	if  {
		switch .Opcode() {
		case ssa.OpcodeFadd:
			 = sseOpcodeAddsd
		case ssa.OpcodeFsub:
			 = sseOpcodeSubsd
		case ssa.OpcodeFmul:
			 = sseOpcodeMulsd
		case ssa.OpcodeFdiv:
			 = sseOpcodeDivsd
		default:
			panic("BUG")
		}
	} else {
		switch .Opcode() {
		case ssa.OpcodeFadd:
			 = sseOpcodeAddss
		case ssa.OpcodeFsub:
			 = sseOpcodeSubss
		case ssa.OpcodeFmul:
			 = sseOpcodeMulss
		case ssa.OpcodeFdiv:
			 = sseOpcodeDivss
		default:
			panic("BUG")
		}
	}

	,  := .c.ValueDefinition(), .c.ValueDefinition()
	 := .getOperand_Reg()
	 := .getOperand_Reg()
	 := .c.VRegOf(.Return())

	// rm is being overwritten, so we first copy its value to a temp register,
	// in case it is referenced again later.
	 := .copyToTmp(.reg())

	 := .allocateInstr().asXmmRmR(, , )
	.insert()

	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction) {
	 := .Arg()
	if !.Type().IsFloat() {
		panic("BUG")
	}
	 := .Type().Bits() == 64
	var  sseOpcode
	if  {
		 = sseOpcodeSqrtsd
	} else {
		 = sseOpcodeSqrtss
	}

	 := .c.ValueDefinition()
	 := .getOperand_Mem_Reg()
	 := .c.VRegOf(.Return())

	 := .allocateInstr().asXmmUnaryRmR(, , )
	.insert()
}

func ( *machine) ( *ssa.Instruction) {
	 := .Arg()
	if !.Type().IsFloat() {
		panic("BUG")
	}
	 := .Type().Bits() == 64
	var  sseOpcode
	var  uint64
	if  {
		switch .Opcode() {
		case ssa.OpcodeFabs:
			,  = 0x7fffffffffffffff, sseOpcodeAndpd
		case ssa.OpcodeFneg:
			,  = 0x8000000000000000, sseOpcodeXorpd
		}
	} else {
		switch .Opcode() {
		case ssa.OpcodeFabs:
			,  = 0x7fffffff, sseOpcodeAndps
		case ssa.OpcodeFneg:
			,  = 0x80000000, sseOpcodeXorps
		}
	}

	 := .c.AllocateVReg(.Type())

	 := .c.ValueDefinition()
	 := .getOperand_Reg()
	 := .c.VRegOf(.Return())

	.lowerFconst(, , )

	 := .allocateInstr().asXmmRmR(, , )
	.insert()

	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction) {
	, , ,  := .StoreData()
	 := .getOperand_Reg(.c.ValueDefinition())
	 := newOperandMem(.lowerToAddressMode(, ))

	 := .allocateInstr()
	switch .Type() {
	case ssa.TypeI32:
		.asMovRM(.reg(), , /8)
	case ssa.TypeI64:
		.asMovRM(.reg(), , /8)
	case ssa.TypeF32:
		.asXmmMovRM(sseOpcodeMovss, .reg(), )
	case ssa.TypeF64:
		.asXmmMovRM(sseOpcodeMovsd, .reg(), )
	case ssa.TypeV128:
		.asXmmMovRM(sseOpcodeMovdqu, .reg(), )
	default:
		panic("BUG")
	}
	.insert()
}

func ( *machine) ( *ssa.Instruction) {
	 := .Opcode() == ssa.OpcodeCall
	var  ssa.Value
	var  ssa.FuncRef
	var  ssa.SignatureID
	var  []ssa.Value
	var  bool
	if  {
		, ,  = .CallData()
	} else {
		, , ,  = .CallIndirectData()
	}
	 := .c.GetFunctionABI(.c.SSABuilder().ResolveSignature())

	 := int64(.AlignedArgResultStackSlotSize())
	if .maxRequiredStackSizeForCalls < +16 {
		.maxRequiredStackSizeForCalls =  + 16 // 16 == return address + RBP.
	}

	// Note: See machine.SetupPrologue for the stack layout.
	// The stack pointer decrease/increase will be inserted later in the compilation.

	for ,  := range  {
		 := .c.VRegOf()
		 := .c.ValueDefinition()
		.callerGenVRegToFunctionArg(, , , , )
	}

	if  {
		// Go's memmove *might* use all xmm0-xmm15, so we need to release them.
		// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/cmd/compile/abi-internal.md#architecture-specifics
		// https://github.com/golang/go/blob/49d42128fd8594c172162961ead19ac95e247d24/src/runtime/memmove_amd64.s#L271-L286
		for  := regalloc.RealReg(0);  < 16; ++ {
			.insert(.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[xmm0+]))
		}
		// Since Go 1.24 it may also use DX, which is not reserved for the function call's 3 args.
		// https://github.com/golang/go/blob/go1.24.0/src/runtime/memmove_amd64.s#L123
		.insert(.allocateInstr().asDefineUninitializedReg(regInfo.RealRegToVReg[rdx]))
	}

	if  {
		 := .allocateInstr().asCall(, )
		.insert()
	} else {
		 := .getOperand_Mem_Reg(.c.ValueDefinition())
		 := .allocateInstr().asCallIndirect(, )
		.insert()
	}

	if  {
		for  := regalloc.RealReg(0);  < 16; ++ {
			.insert(.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[xmm0+]))
		}
		.insert(.allocateInstr().asNopUseReg(regInfo.RealRegToVReg[rdx]))
	}

	var  int
	,  := .Returns()
	if .Valid() {
		.callerGenFunctionReturnVReg(, 0, .c.VRegOf(), )
		++
	}

	for ,  := range  {
		.callerGenFunctionReturnVReg(, , .c.VRegOf(), )
		++
	}
}

// callerGenVRegToFunctionArg is the counterpart of GenFunctionArgToVReg: it generates the
// caller side of the function call, moving each argument value into its ABI location.
func ( *machine) ( *backend.FunctionABI,  int,  regalloc.VReg,  backend.SSAValueDefinition,  int64) {
	 := &.Args[]
	if .IsFromInstr() {
		// Constant instructions are inlined.
		if  := .Instr; .Constant() {
			.insertLoadConstant(, )
		}
	}
	if .Kind == backend.ABIArgKindReg {
		.InsertMove(.Reg, , .Type)
	} else {
		 := .allocateInstr()
		 := newOperandMem(.newAmodeImmReg(
			// -stackSlotSize because the stack pointer is not yet decreased.
			uint32(.Offset-), rspVReg))
		switch .Type {
		case ssa.TypeI32:
			.asMovRM(, , 4)
		case ssa.TypeI64:
			.asMovRM(, , 8)
		case ssa.TypeF32:
			.asXmmMovRM(sseOpcodeMovss, , )
		case ssa.TypeF64:
			.asXmmMovRM(sseOpcodeMovsd, , )
		case ssa.TypeV128:
			.asXmmMovRM(sseOpcodeMovdqu, , )
		default:
			panic("BUG")
		}
		.insert()
	}
}

func ( *machine) ( *backend.FunctionABI,  int,  regalloc.VReg,  int64) {
	 := &.Rets[]
	if .Kind == backend.ABIArgKindReg {
		.InsertMove(, .Reg, .Type)
	} else {
		 := .allocateInstr()
		 := newOperandMem(.newAmodeImmReg(
			// -stackSlotSize because the stack pointer is not yet decreased.
			uint32(.ArgStackSize+.Offset-), rspVReg))
		switch .Type {
		case ssa.TypeI32:
			.asMovzxRmR(extModeLQ, , )
		case ssa.TypeI64:
			.asMov64MR(, )
		case ssa.TypeF32:
			.asXmmUnaryRmR(sseOpcodeMovss, , )
		case ssa.TypeF64:
			.asXmmUnaryRmR(sseOpcodeMovsd, , )
		case ssa.TypeV128:
			.asXmmUnaryRmR(sseOpcodeMovdqu, , )
		default:
			panic("BUG")
		}
		.insert()
	}
}

// InsertMove implements backend.Machine.
func (m *machine) InsertMove(dst, src regalloc.VReg, typ ssa.Type) {
	switch typ {
	case ssa.TypeI32, ssa.TypeI64:
		i := m.allocateInstr().asMovRR(src, dst, typ.Bits() == 64)
		m.insert(i)
	case ssa.TypeF32, ssa.TypeF64, ssa.TypeV128:
		var op sseOpcode
		switch typ {
		case ssa.TypeF32:
			op = sseOpcodeMovss
		case ssa.TypeF64:
			op = sseOpcodeMovsd
		case ssa.TypeV128:
			op = sseOpcodeMovdqa
		}
		i := m.allocateInstr().asXmmUnaryRmR(op, newOperandReg(src), dst)
		m.insert(i)
	default:
		panic("BUG")
	}
}

// Format implements backend.Machine.
func (m *machine) Format() string {
	begins := map[*instruction]label{}
	for l := label(0); l < m.nextLabel; l++ {
		pos := m.labelPositionPool.Get(int(l))
		if pos != nil {
			begins[pos.begin] = l
		}
	}

	var lines []string
	for cur := m.rootInstr; cur != nil; cur = cur.next {
		if l, ok := begins[cur]; ok {
			var labelStr string
			if l <= m.maxSSABlockID {
				labelStr = fmt.Sprintf("%s (SSA Block: blk%d):", l, l)
			} else {
				labelStr = fmt.Sprintf("%s:", l)
			}
			lines = append(lines, labelStr)
		}
		if cur.kind == nop0 {
			continue
		}
		lines = append(lines, "\t"+cur.String())
	}
	for _, cst := range m.consts {
		if cst._var == nil {
			lines = append(lines, fmt.Sprintf("%s: const [%d %d]", cst.label, cst.lo, cst.hi))
		} else {
			lines = append(lines, fmt.Sprintf("%s: const %#x", cst.label, cst._var))
		}
	}
	return "\n" + strings.Join(lines, "\n") + "\n"
}

func ( *machine) ( *instruction) {
	.labelResolutionPends = .labelResolutionPends[:0]
	 := .c.BufPtr()
	for  := ;  != nil;  = .next {
		 := int64(len(*))
		if .kind == nop0 {
			 := .nop0Label()
			 := .labelPositionPool.Get(int())
			if  != nil {
				.binaryOffset = 
			}
		}

		 := .encode(.c)
		if  {
			.labelResolutionPends = append(.labelResolutionPends,
				labelResolutionPend{instr: , imm32Offset: int64(len(*)) - 4},
			)
		}
	}

	for  := range .labelResolutionPends {
		 := &.labelResolutionPends[]
		switch .instr.kind {
		case jmp, jmpIf, lea:
			 := .instr.jmpLabel()
			 := .labelPositionPool.Get(int()).binaryOffset
			 := .imm32Offset
			 := int32( - (.imm32Offset + 4)) // +4 because RIP points to the next instruction.
			binary.LittleEndian.PutUint32((*)[:], uint32())
		default:
			panic("BUG")
		}
	}
}

// Encode implements backend.Machine Encode.
func ( *machine) ( context.Context) ( error) {
	 := .c.BufPtr()

	var  string
	var  int
	var  map[*labelPosition]label
	if wazevoapi.PerfMapEnabled {
		 = wazevoapi.GetCurrentFunctionName()
		 = make(map[*labelPosition]label)
		for  := 0;  <= .labelPositionPool.MaxIDEncountered(); ++ {
			 := .labelPositionPool.Get()
			[] = label()
		}
		 = wazevoapi.GetCurrentFunctionIndex()
	}

	.labelResolutionPends = .labelResolutionPends[:0]
	for ,  := range .orderedSSABlockLabelPos {
		 := int64(len(*))
		.binaryOffset = 
		for  := .begin;  != .end.next;  = .next {
			 := int64(len(*))

			switch .kind {
			case nop0:
				 := .nop0Label()
				if  := .labelPositionPool.Get(int());  != nil {
					.binaryOffset = 
				}
			case sourceOffsetInfo:
				.c.AddSourceOffsetInfo(, .sourceOffsetInfo())
			}

			 := .encode(.c)
			if  {
				.labelResolutionPends = append(.labelResolutionPends,
					labelResolutionPend{instr: , instrOffset: , imm32Offset: int64(len(*)) - 4},
				)
			}
		}

		if wazevoapi.PerfMapEnabled {
			 := []
			 := int64(len(*)) - 
			wazevoapi.PerfMap.AddModuleEntry(, , uint64(), fmt.Sprintf("%s:::::%s", , ))
		}
	}

	for  := range .consts {
		 := int64(len(*))
		 := &.consts[]
		.labelPos.binaryOffset = 
		if ._var == nil {
			,  := .lo, .hi
			.c.Emit8Bytes()
			.c.Emit8Bytes()
		} else {
			for ,  := range ._var {
				.c.EmitByte()
			}
		}
	}

	 := *
	for  := range .labelResolutionPends {
		 := &.labelResolutionPends[]
		switch .instr.kind {
		case jmp, jmpIf, lea, xmmUnaryRmR:
			 := .instr.jmpLabel()
			 := .labelPositionPool.Get(int()).binaryOffset
			 := .imm32Offset
			 := int32( - (.imm32Offset + 4)) // +4 because RIP points to the next instruction.
			binary.LittleEndian.PutUint32([:], uint32())
		case jmpTableIsland:
			 := .instrOffset
			// Each 8-byte entry is the offset from the beginning of the jmpTableIsland instruction.
			 := .jmpTableTargets[.instr.u1]
			for ,  := range  {
				 := .labelPositionPool.Get(int()).binaryOffset
				 :=  - 
				binary.LittleEndian.PutUint64([+int64()*8:], uint64())
			}
		default:
			panic("BUG")
		}
	}
	return
}

// ResolveRelocations implements backend.Machine.
func (m *machine) ResolveRelocations(refToBinaryOffset []int, _ int, executable []byte, relocations []backend.RelocationInfo, _ []int) {
	for _, r := range relocations {
		offset := r.Offset
		calleeFnOffset := refToBinaryOffset[r.FuncRef]
		// r.Offset points at the last 4 bytes (the rel32 immediate) of the call instruction.
		brInstr := executable[offset : offset+4]
		diff := int64(calleeFnOffset) - (offset + 4) // +4 because we want the offset of the next instruction (in x64, RIP always points to the next instruction).
		brInstr[0] = byte(diff)
		brInstr[1] = byte(diff >> 8)
		brInstr[2] = byte(diff >> 16)
		brInstr[3] = byte(diff >> 24)
	}
}

// CallTrampolineIslandInfo implements backend.Machine CallTrampolineIslandInfo.
func (m *machine) CallTrampolineIslandInfo(_ int) (_, _ int, _ error) { return }

func (m *machine) lowerIcmpToFlag(xd, yd backend.SSAValueDefinition, _64 bool) {
	x := m.getOperand_Reg(xd)
	y := m.getOperand_Mem_Imm32_Reg(yd)
	cmp := m.allocateInstr().asCmpRmiR(true, y, x.reg(), _64)
	m.insert(cmp)
}

func ( *machine) ( *ssa.Instruction) (,  cond,  bool) {
	, ,  := .FcmpData()
	switch  {
	case ssa.FloatCmpCondEqual:
		,  = condNP, condZ
		 = true
	case ssa.FloatCmpCondNotEqual:
		,  = condP, condNZ
	case ssa.FloatCmpCondLessThan:
		 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThan)
		 = condInvalid
		,  = , 
	case ssa.FloatCmpCondLessThanOrEqual:
		 = condFromSSAFloatCmpCond(ssa.FloatCmpCondGreaterThanOrEqual)
		 = condInvalid
		,  = , 
	default:
		 = condFromSSAFloatCmpCond()
		 = condInvalid
	}

	var  sseOpcode
	if .Type() == ssa.TypeF32 {
		 = sseOpcodeUcomiss
	} else {
		 = sseOpcodeUcomisd
	}

	 := .getOperand_Reg(.c.ValueDefinition())
	 := .getOperand_Mem_Reg(.c.ValueDefinition())
	.insert(.allocateInstr().asXmmCmpRmR(, , .reg()))
	return
}

// allocateInstr allocates an instruction.
func (m *machine) allocateInstr() *instruction {
	instr := m.instrPool.Allocate()
	if !m.regAllocStarted {
		instr.addedBeforeRegAlloc = true
	}
	return instr
}

func (m *machine) allocateNop() *instruction {
	instr := m.allocateInstr()
	instr.kind = nop0
	return instr
}

func (m *machine) insert(i *instruction) {
	m.pendingInstructions = append(m.pendingInstructions, i)
}

func ( *machine) () ( *instruction,  label) { //nolint
	,  := .allocateLabel()
	 = .allocateInstr()
	.asNop0WithLabel()
	.begin, .end = , 
	return
}

func ( *machine) () (label, *labelPosition) {
	 := .nextLabel
	 := .labelPositionPool.GetOrAllocate(int())
	.nextLabel++
	return , 
}

func ( *machine) ( regalloc.VRegID,  byte) int64 {
	,  := .spillSlots[]
	if ! {
		 = .spillSlotSize
		.spillSlots[] = 
		.spillSlotSize += int64()
	}
	return 
}

func ( *machine) ( regalloc.VReg,  regalloc.VReg) {
	 := .allocateInstr()
	if .RegType() == regalloc.RegTypeInt {
		.asMovRR(, , true)
	} else {
		.asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(), )
	}
	.insert()
}

func ( *machine) ( regalloc.VReg) regalloc.VReg {
	 := .c.TypeOf()
	 := .c.AllocateVReg()
	.copyTo(, )
	return 
}

func ( *machine) () int64 {
	return .maxRequiredStackSizeForCalls +
		.frameSize() +
		16 + // Needed for stack checking.
		16 // Return address and the caller's RBP.
}

func ( *machine) () int64 {
	 := .clobberedRegSlotSize() + .spillSlotSize
	if &0xf != 0 {
		panic(fmt.Errorf("BUG: frame size %d is not 16-byte aligned", ))
	}
	return 
}

func ( *machine) () int64 {
	return int64(len(.clobberedRegs) * 16)
}
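
// requiredStackSizeExample is a hypothetical, standalone illustration of how the
// three functions above compose: the area reserved for calls, plus the frame
// (16 bytes per clobbered register plus the spill slots), plus 16 bytes used by
// the stack-bounds check, plus 16 bytes for the return address and the caller's RBP.
func requiredStackSizeExample(maxCallArea, numClobberedRegs, spillSlotSize int64) int64 {
	frame := numClobberedRegs*16 + spillSlotSize
	return maxCallArea + frame + 16 + 16
}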

func ( *machine) ( *ssa.Instruction,  bool,  bool) {
	, ,  := .Arg3()

	 := .getOperand_Reg(.c.ValueDefinition())
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf()
	 := .c.AllocateVReg(.Return().Type())

	.copyTo(.reg(), raxVReg)
	.insert(.allocateInstr().asDefineUninitializedReg(rdxVReg))
	.insert(.allocateInstr().asDefineUninitializedReg())
	 := .allocateInstr().asIdivRemSequence(, .reg(), , , , .Type().Bits() == 64)
	.insert()
	 := .c.VRegOf(.Return())
	if  {
		.copyTo(raxVReg, )
	} else {
		.copyTo(rdxVReg, )
	}
}

func ( *machine) ( *instruction) {
	, , , , ,  := .idivRemSequenceData()

	 := raxVReg

	// Ensure yr is not zero.
	 := .allocateInstr()
	.asCmpRmiR(false, newOperandReg(), , )
	.insert()

	 := .allocateInstr()
	.insert()

	 := .lowerExitWithCode(, wazevoapi.ExitCodeIntegerDivisionByZero)

	// If not zero, we can proceed with the division.
	.asJmpIf(condNZ, newOperandLabel())

	var  *instruction
	if  {
		var  uint64
		if  {
			 = 0xffffffffffffffff
		} else {
			 = 0xffffffff
		}
		.lowerIconst(, , )

		if  {
			// For signed division, we need extra branches for the "math.MinInt{32,64} / -1"
			// case, which raises a hardware divide error (reported as a floating-point
			// exception signal) because the resulting value exceeds the maximum of the
			// signed integer type. See the i64DivRemRefExample sketch after this function
			// for the overall trapping semantics.

			// First, we check if the divisor is -1.
			 := .allocateInstr()
			.asCmpRmiR(true, newOperandReg(), , )
			.insert()

			 := .allocateInstr()
			.insert()

			var  uint64
			if  {
				 = 0x8000000000000000
			} else {
				 = 0x80000000
			}
			.lowerIconst(, , )

			// Next we check if the dividend is the most negative value of the signed integer type, i.e.
			// whether we are trying to compute (math.MinInt32 / -1) or (math.MinInt64 / -1) respectively.
			 := .allocateInstr()
			.asCmpRmiR(true, newOperandReg(), , )
			.insert()

			 := .allocateInstr()
			.insert()

			// Trap if we are trying to compute (math.MinInt32 / -1) or (math.MinInt64 / -1):
			// the division overflows because the result would be 2^31 (resp. 2^63), which is
			// larger than the maximum signed 32-bit (resp. 64-bit) integer.
			 := .lowerExitWithCode(, wazevoapi.ExitCodeIntegerOverflow)
			.asJmpIf(condNZ, newOperandLabel())
			.asJmpIf(condNZ, newOperandLabel())
		} else {
			// For the remainder, zero the DX register and compare the divisor to -1.
			 := .allocateInstr().asZeros(rdxVReg)
			.insert()

			// We check if the divisor is -1.
			 := .allocateInstr()
			.asCmpRmiR(true, newOperandReg(), , )
			.insert()

			 = .allocateInstr()
			.insert()
		}

		// Sign-extend DX register to have 2*x.Type().Bits() dividend over DX and AX registers.
		 := .allocateInstr()
		.asSignExtendData()
		.insert()
	} else {
		// Zeros DX register to have 2*x.Type().Bits() dividend over DX and AX registers.
		 := .allocateInstr().asZeros(rdxVReg)
		.insert()
	}

	 := .allocateInstr()
	.asDiv(newOperandReg(), , )
	.insert()

	,  := .allocateBrTarget()
	.insert()
	// If we are compiling a Rem instruction, a divisor of -1 jumps straight here: the remainder is already zero in DX, so the division is skipped.
	if  != nil {
		.asJmpIf(condZ, newOperandLabel())
	}
}
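
// i64DivRemRefExample is a hypothetical reference model (not executed by the
// compiler) of the trapping semantics the signed division sequence above
// implements: division by zero traps, math.MinInt64 / -1 traps as integer
// overflow, and math.MinInt64 % -1 is defined to be 0.
func i64DivRemRefExample(x, y int64, isRem bool) (int64, error) {
	if y == 0 {
		return 0, fmt.Errorf("integer division by zero")
	}
	if x == math.MinInt64 && y == -1 {
		if isRem {
			return 0, nil // Only the quotient overflows; the remainder is zero.
		}
		return 0, fmt.Errorf("integer overflow")
	}
	if isRem {
		return x % y, nil
	}
	return x / y, nil
}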

func ( *machine) ( *ssa.Instruction,  roundingMode) {
	 := .Arg()
	if !.Type().IsFloat() {
		panic("BUG?")
	}
	var  sseOpcode
	if .Type().Bits() == 64 {
		 = sseOpcodeRoundsd
	} else {
		 = sseOpcodeRoundss
	}

	 := .c.ValueDefinition()
	 := .getOperand_Mem_Reg()
	 := .c.VRegOf(.Return())

	 := .allocateInstr().asXmmUnaryRmRImm(, uint8(), , )
	.insert()
}

func ( *machine) ( *ssa.Instruction) {
	,  := .Arg2()
	if !.Type().IsFloat() {
		panic("BUG?")
	}

	 := .Type().Bits() == 64
	 := .Opcode() == ssa.OpcodeFmin
	var  sseOpcode

	switch {
	case  && :
		 = sseOpcodeMinpd
	case  && !:
		 = sseOpcodeMaxpd
	case ! && :
		 = sseOpcodeMinps
	case ! && !:
		 = sseOpcodeMaxps
	}

	,  := .c.ValueDefinition(), .c.ValueDefinition()
	 := .getOperand_Reg()
	// We cannot ensure that y is aligned to 16 bytes, so we have to load it into a register.
	 := .getOperand_Reg()
	 := .c.VRegOf(.Return())

	 := .copyToTmp(.reg())

	// Check if this is (either x1 or x2 is NaN) or (x1 equals x2) case.
	 := .allocateInstr()
	if  {
		.asXmmCmpRmR(sseOpcodeUcomisd, , )
	} else {
		.asXmmCmpRmR(sseOpcodeUcomiss, , )
	}
	.insert()

	// At this point, we have the three cases of conditional flags below
	// (See https://www.felixcloutier.com/x86/ucomiss#operation for detail.)
	//
	// 1) Two values are NaN-free and different: All flags are cleared.
	// 2) Two values are NaN-free and equal: only the ZF flag is set.
	// 3) One of the two values is NaN: the ZF, PF and CF flags are set.

	// Jump to handle case 1) by checking the ZF flag,
	// since ZF is set only in cases 2) and 3).
	 := .allocateInstr()
	.insert()

	// Start handling 2) and 3).

	// Jump if one of two values is NaN by checking the parity flag (PF).
	 := .allocateInstr()
	.insert()

	// Start handling 2) NaN-free and equal.

	// Before we exit this case, we have to ensure that positive zero (or negative zero for the min instruction) is
	// returned when the two values are positive and negative zeros.
	var  sseOpcode
	switch {
	case ! && :
		 = sseOpcodeOrps
	case  && :
		 = sseOpcodeOrpd
	case ! && !:
		 = sseOpcodeAndps
	case  && !:
		 = sseOpcodeAndpd
	}
	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	// Done, jump to end.
	 := .allocateInstr()
	.insert()

	// Start handling 3) either is NaN.
	,  := .allocateBrTarget()
	.insert()
	.asJmpIf(condP, newOperandLabel())

	// We emit the ADD instruction to produce the NaN in tmp.
	 := .allocateInstr()
	if  {
		.asXmmRmR(sseOpcodeAddsd, , )
	} else {
		.asXmmRmR(sseOpcodeAddss, , )
	}
	.insert()

	// Exit from the NaN case branch.
	 := .allocateInstr()
	.insert()

	// Start handling 1).
	,  := .allocateBrTarget()
	.insert()
	.asJmpIf(condNZ, newOperandLabel())

	// Now handle the NaN-free and different values case.
	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	,  := .allocateBrTarget()
	.insert()
	.asJmp(newOperandLabel())
	.asJmp(newOperandLabel())

	.copyTo(, )
}
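
// fmin64RefExample is a hypothetical per-value model of the branchy sequence
// above for the 64-bit min case: an unordered comparison (either input NaN)
// yields a NaN via the trailing ADDSD, equal inputs (including +0 and -0) are
// merged with a bitwise OR so that -0 wins, and otherwise the smaller value is
// taken. The max case is symmetric, using AND so that +0 wins.
func fmin64RefExample(x, y float64) float64 {
	switch {
	case x != x || y != y: // 3) Either input is NaN.
		return x + y // Adding produces a NaN, mirroring the ADDSD in the lowering.
	case x == y: // 2) Equal, possibly +0 vs -0.
		return math.Float64frombits(math.Float64bits(x) | math.Float64bits(y))
	case x < y: // 1) NaN-free and different.
		return x
	default:
		return y
	}
}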

func ( *machine) ( *ssa.Instruction) {
	,  := .Arg2()
	if !.Type().IsFloat() {
		panic("BUG")
	}

	 := .Type().Bits() == 64

	,  := .c.ValueDefinition(), .c.ValueDefinition()
	 := .getOperand_Reg()
	 := .getOperand_Reg()
	 := .c.VRegOf(.Return())

	// Clear the non-sign bits of src via AND with the mask.
	var ,  sseOpcode
	var  uint64
	if  {
		, ,  = 0x8000000000000000, sseOpcodeAndpd, sseOpcodeOrpd
	} else {
		, ,  = 0x80000000, sseOpcodeAndps, sseOpcodeOrps
	}

	 := .c.AllocateVReg(.Type())
	.lowerFconst(, , )
	 := .c.AllocateVReg(.Type())
	.lowerFconst(, ^, )

	// Extract the sign bits of rn.
	 := .allocateInstr().asXmmRmR(, , )
	.insert()

	// Clear the sign bit of dst via AND with the non-sign bit mask.
	 := .allocateInstr().asXmmRmR(, , )
	.insert()

	// Copy the sign bits of src to dst via OR.
	 := .allocateInstr().asXmmRmR(, newOperandReg(), )
	.insert()

	.copyTo(, )
}
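
// fcopysign64RefExample is a hypothetical reference of the mask-based sequence
// above: keep the non-sign bits of the destination (AND with the inverted sign
// mask) and OR in the sign bit extracted from the source (AND with the sign mask).
func fcopysign64RefExample(x, sign float64) float64 {
	const signMask = uint64(1) << 63
	bits := math.Float64bits(x)&^signMask | math.Float64bits(sign)&signMask
	return math.Float64frombits(bits)
}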

func ( *machine) ( *ssa.Instruction) {
	,  := .BitcastData()
	 := .Type()
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf(.Return())
	switch {
	case  == ssa.TypeF32 &&  == ssa.TypeI32:
		 := .allocateInstr().asXmmToGpr(sseOpcodeMovd, .reg(), , false)
		.insert()
	case  == ssa.TypeI32 &&  == ssa.TypeF32:
		 := .allocateInstr().asGprToXmm(sseOpcodeMovd, , , false)
		.insert()
	case  == ssa.TypeF64 &&  == ssa.TypeI64:
		 := .allocateInstr().asXmmToGpr(sseOpcodeMovq, .reg(), , true)
		.insert()
	case  == ssa.TypeI64 &&  == ssa.TypeF64:
		 := .allocateInstr().asGprToXmm(sseOpcodeMovq, , , true)
		.insert()
	default:
		panic(fmt.Sprintf("invalid bitcast from %s to %s", , ))
	}
}

func ( *machine) (, ,  regalloc.VReg, , ,  bool) {
	var  regalloc.VReg
	if  {
		 = .c.AllocateVReg(ssa.TypeF64)
	} else {
		 = .c.AllocateVReg(ssa.TypeF32)
	}

	.insert(.allocateInstr().asDefineUninitializedReg())
	,  := .c.AllocateVReg(ssa.TypeI64), .c.AllocateVReg(ssa.TypeI64)
	.insert(.allocateInstr().asDefineUninitializedReg())
	.insert(.allocateInstr().asDefineUninitializedReg())

	.insert(.allocateFcvtToSintSequence(, , , , , , , ))
	.copyTo(, )
}

func ( *machine) ( *instruction) {
	, , , , , , ,  := .fcvtToSintSequenceData()
	var ,  sseOpcode
	if  {
		,  = sseOpcodeUcomisd, sseOpcodeCvttsd2si
	} else {
		,  = sseOpcodeUcomiss, sseOpcodeCvttss2si
	}

	 := .allocateInstr()
	.asXmmToGpr(, , , )
	.insert()

	// Check if the result was INT_MIN by comparing it against 1: the subtraction in CMP overflows (sets OF) only when the value is INT_MIN.
	 := .allocateInstr()
	.asCmpRmiR(true, newOperandImm32(1), , )
	.insert()

	// If no overflow, then we are done.
	,  := .allocateBrTarget()
	 := .allocateInstr()
	.asJmpIf(condNO, newOperandLabel())
	.insert()

	// Now, check for NaN.
	 := .allocateInstr()
	.asXmmCmpRmR(, newOperandReg(), )
	.insert()

	// We allocate the "non-nan target" here, but we will insert it later.
	,  := .allocateBrTarget()
	 := .allocateInstr()
	.asJmpIf(condNP, newOperandLabel())
	.insert()

	if  {
		// If NaN and saturating, return 0.
		 := .allocateInstr().asZeros()
		.insert()

		 := .allocateInstr()
		.asJmp(newOperandLabel())
		.insert()

		// Otherwise:
		.insert()

		// Zero-out the tmp register.
		 := .allocateInstr().asZeros()
		.insert()

		 := .allocateInstr().asXmmCmpRmR(, newOperandReg(), )
		.insert()

		// if >= jump to end.
		 := .allocateInstr()
		.asJmpIf(condB, newOperandLabel())
		.insert()

		// Otherwise, saturate to INT_MAX.
		if  {
			.lowerIconst(, math.MaxInt64, )
		} else {
			.lowerIconst(, math.MaxInt32, )
		}

	} else {

		// If non-sat, NaN, trap.
		.lowerExitWithCode(, wazevoapi.ExitCodeInvalidConversionToInteger)

		// Otherwise, we will jump here.
		.insert()

		// Jump over the trap if src is larger than the threshold.
		 := condNB

		// The magic constants are the various combinations of minInt for int{32,64} represented as float{32,64}.
		var  uint64
		switch {
		case  && :
			 = 0xc3e0000000000000
		case  && !:
			 = condNBE
			 = 0xC1E0_0000_0020_0000
		case ! && :
			 = 0xDF00_0000
		case ! && !:
			 = 0xCF00_0000
		}

		 := .allocateInstr().asImm(, , )
		.insert()

		 := .allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(), , )
		.insert()

		 := .allocateInstr().asXmmCmpRmR(, newOperandReg(), )
		.insert()

		 := .allocateInstr()
		,  := .allocateBrTarget()
		.asJmpIf(, newOperandLabel())
		.insert()

		.lowerExitWithCode(, wazevoapi.ExitCodeIntegerOverflow)

		// If positive, it was a real overflow.
		.insert()

		// Zero out the temp register.
		 := .allocateInstr()
		.asXmmRmR(sseOpcodeXorpd, newOperandReg(), )
		.insert()

		 := .allocateInstr()
		.asXmmCmpRmR(, newOperandReg(), )
		.insert()

		// If >= jump to end.
		 := .allocateInstr().asJmpIf(condNB, newOperandLabel())
		.insert()
		.lowerExitWithCode(, wazevoapi.ExitCodeIntegerOverflow)
	}

	.insert()
}
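
// truncSatF64ToI32Example is a hypothetical reference of the saturating branch
// above, specialized to f64 -> i32: NaN becomes 0, values below the minimum
// clamp to math.MinInt32, values above the maximum clamp to math.MaxInt32, and
// everything else is truncated toward zero. The non-saturating branch instead
// exits with ExitCodeInvalidConversionToInteger or ExitCodeIntegerOverflow.
func truncSatF64ToI32Example(f float64) int32 {
	switch {
	case f != f: // NaN.
		return 0
	case f <= math.MinInt32:
		return math.MinInt32
	case f >= math.MaxInt32:
		return math.MaxInt32
	default:
		return int32(f) // Truncation toward zero.
	}
}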

func ( *machine) (, ,  regalloc.VReg, , ,  bool) {
	,  := .c.AllocateVReg(ssa.TypeF64), .c.AllocateVReg(ssa.TypeF64)
	.insert(.allocateInstr().asDefineUninitializedReg())
	.insert(.allocateInstr().asDefineUninitializedReg())
	,  := .c.AllocateVReg(ssa.TypeI64), .c.AllocateVReg(ssa.TypeI64)
	.insert(.allocateInstr().asDefineUninitializedReg())
	.insert(.allocateInstr().asDefineUninitializedReg())

	.insert(.allocateFcvtToUintSequence(
		, , , , , , , , ,
	))
	.copyTo(, )
}

func ( *machine) ( *instruction) {
	, , , , , , , ,  := .fcvtToUintSequenceData()

	var , ,  sseOpcode
	if  {
		, ,  = sseOpcodeSubsd, sseOpcodeUcomisd, sseOpcodeCvttsd2si
	} else {
		, ,  = sseOpcodeSubss, sseOpcodeUcomiss, sseOpcodeCvttss2si
	}

	,  := .allocateBrTarget()

	switch {
	case  && :
		 := .allocateInstr().asImm(, 0x43e0000000000000, true)
		.insert()
		 := .allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(), , true)
		.insert()
	case  && !:
		 := .allocateInstr().asImm(, 0x41e0000000000000, true)
		.insert()
		 := .allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(), , true)
		.insert()
	case ! && :
		 := .allocateInstr().asImm(, 0x5f000000, false)
		.insert()
		 := .allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(), , false)
		.insert()
	case ! && !:
		 := .allocateInstr().asImm(, 0x4f000000, false)
		.insert()
		 := .allocateInstr().asGprToXmm(sseOpcodeMovq, newOperandReg(), , false)
		.insert()
	}

	 := .allocateInstr()
	.asXmmCmpRmR(, newOperandReg(), )
	.insert()

	// If above `tmp` ("large threshold"), jump to `ifAboveThreshold`
	,  := .allocateBrTarget()
	 := .allocateInstr()
	.asJmpIf(condNB, newOperandLabel())
	.insert()

	,  := .allocateBrTarget()
	 := .allocateInstr()
	.asJmpIf(condNP, newOperandLabel())
	.insert()

	// If NaN, handle the error condition.
	if  {
		// On NaN, saturating, we just return 0.
		 := .allocateInstr().asZeros()
		.insert()

		 := .allocateInstr()
		.asJmp(newOperandLabel())
		.insert()
	} else {
		// On NaN, non-saturating, we trap.
		.lowerExitWithCode(, wazevoapi.ExitCodeInvalidConversionToInteger)
	}

	// If not NaN, land here.
	.insert()

	// Truncation happens here.

	 := .allocateInstr()
	.asXmmToGpr(, , , )
	.insert()

	// Check if the result is negative.
	 := .allocateInstr()
	.asCmpRmiR(true, newOperandImm32(0), , )
	.insert()

	// If non-neg, jump to end.
	 := .allocateInstr()
	.asJmpIf(condNL, newOperandLabel())
	.insert()

	if  {
		// If the input was "small" (< 2**(width -1)), the only way to get an integer
		// overflow is because the input was too small: saturate to the min value, i.e. 0.
		 := .allocateInstr().asZeros()
		.insert()

		 := .allocateInstr()
		.asJmp(newOperandLabel())
		.insert()
	} else {
		// If not saturating, trap.
		.lowerExitWithCode(, wazevoapi.ExitCodeIntegerOverflow)
	}

	// If above the threshold, land here.
	.insert()

	// tmpDiff := threshold - rn.
	 := .allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandReg(), )
	.insert()

	 := .allocateInstr()
	.asXmmRmR(, newOperandReg(), ) // must be -0x8000000000000000
	.insert()

	 := .allocateInstr()
	.asXmmToGpr(, , , )
	.insert()

	// Check if the result is negative.
	 := .allocateInstr().asCmpRmiR(true, newOperandImm32(0), , )
	.insert()

	,  := .allocateBrTarget()
	 := .allocateInstr()
	.asJmpIf(condNL, newOperandLabel())
	.insert()

	if  {
		// The input was "large" (>= maxInt), so the only way to get an integer
		// overflow is because the input was too large: saturate to the max value.
		var  uint64
		if  {
			 = math.MaxUint64
		} else {
			 = math.MaxUint32
		}
		.lowerIconst(, , )

		 := .allocateInstr()
		.asJmp(newOperandLabel())
		.insert()
	} else {
		// If not saturating, trap.
		.lowerExitWithCode(, wazevoapi.ExitCodeIntegerOverflow)
	}

	.insert()

	var  operand
	if  {
		.lowerIconst(, 0x8000000000000000, true)
		 = newOperandReg()
	} else {
		 = newOperandImm32(0x80000000)
	}

	 := .allocateInstr()
	.asAluRmiR(aluRmiROpcodeAdd, , , )
	.insert()

	.insert()
}
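
// truncSatF64ToU64Example is a hypothetical reference of the two-path sequence
// above, specialized to f64 -> u64: inputs below the 2^63 threshold go through
// the signed conversion directly, larger ones have the threshold subtracted
// before the conversion and the top bit added back afterwards, NaN and negative
// inputs saturate to 0, and inputs at or above 2^64 saturate to math.MaxUint64.
func truncSatF64ToU64Example(f float64) uint64 {
	const threshold = float64(1 << 63) // The "large threshold" compared against above.
	switch {
	case f != f || f <= -1: // NaN or too small: saturate to 0.
		return 0
	case f < threshold: // Small enough for a plain signed conversion.
		return uint64(int64(f))
	case f-threshold < threshold: // Large but still below 2^64.
		return uint64(int64(f-threshold)) + 1<<63 // Add the top bit back.
	default:
		return math.MaxUint64
	}
}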

func ( *machine) (,  operand, ,  bool) {
	var  sseOpcode
	if  {
		 = sseOpcodeCvtsi2sd
	} else {
		 = sseOpcodeCvtsi2ss
	}

	 := .allocateInstr()
	.asGprToXmm(, , .reg(), )
	.insert()
}

func ( *machine) (,  operand, ,  bool) {
	var  sseOpcode
	if  {
		 = sseOpcodeCvtsi2sd
	} else {
		 = sseOpcodeCvtsi2ss
	}

	// If the source is 32-bit, we just perform the conversion with 64-bit width.
	//
	// See the following link for why we use 64bit conversion for unsigned 32bit integer sources:
	// https://stackoverflow.com/questions/41495498/fpu-operations-generated-by-gcc-during-casting-integer-to-float.
	//
	// Here's the summary:
	// >> CVTSI2SS is indeed designed for converting a signed integer to a scalar single-precision float,
	// >> not an unsigned integer like you have here. So what gives? Well, a 64-bit processor has 64-bit wide
	// >> registers available, so the unsigned 32-bit input values can be stored as signed 64-bit intermediate values,
	// >> which allows CVTSI2SS to be used after all.
	//
	if ! {
		// Before we convert, we have to clear the higher 32-bits of the 64-bit register
		// to get the correct result.
		 := .c.AllocateVReg(ssa.TypeI32)
		.insert(.allocateInstr().asMovzxRmR(extModeLQ, , ))
		.insert(.allocateInstr().asGprToXmm(, newOperandReg(), .reg(), true))
		return
	}

	// If uint64, we have to do a bit more work.
	,  := .allocateBrTarget()

	var  regalloc.VReg
	if  {
		 = .c.AllocateVReg(ssa.TypeF64)
	} else {
		 = .c.AllocateVReg(ssa.TypeF32)
	}

	// Check if the most significant bit (sign bit) is set.
	 := .allocateInstr()
	.asCmpRmiR(false, , .reg(), )
	.insert()

	// Jump if the sign bit is set.
	,  := .allocateBrTarget()
	 := .allocateInstr()
	.asJmpIf(condS, newOperandLabel())
	.insert()

	// If the sign bit is not set, the value fits in a signed 64-bit integer, so the signed conversion can be used directly.
	// We convert it to float and emit a jump instruction to exit from this branch.
	 := .allocateInstr()
	.asGprToXmm(, , , )
	.insert()

	// We are done, jump to end.
	 := .allocateInstr()
	.asJmp(newOperandLabel())
	.insert()

	// Now handling the case where sign-bit is set.
	// We emit the following sequences:
	// 	   mov      %rn, %tmp
	// 	   shr      1, %tmp
	// 	   mov      %rn, %tmp2
	// 	   and      1, %tmp2
	// 	   or       %tmp2, %tmp
	// 	   cvtsi2ss %tmp, %xmm0
	// 	   addsd    %xmm0, %xmm0
	.insert()

	 := .copyToTmp(.reg())
	 := .allocateInstr()
	.asShiftR(shiftROpShiftRightLogical, newOperandImm32(1), , )
	.insert()

	 := .copyToTmp(.reg())
	 := .allocateInstr()
	.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), , )
	.insert()

	 := .allocateInstr()
	.asAluRmiR(aluRmiROpcodeOr, newOperandReg(), , )
	.insert()

	 := .allocateInstr()
	.asGprToXmm(, newOperandReg(), , )
	.insert()

	 := .allocateInstr()
	if  {
		.asXmmRmR(sseOpcodeAddsd, newOperandReg(), )
	} else {
		.asXmmRmR(sseOpcodeAddss, newOperandReg(), )
	}
	.insert()

	.insert()
	.copyTo(, .reg())
}
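
// u64ToF64RefExample is a hypothetical reference of the branch above: when the
// sign bit of v is clear the signed conversion is used directly; otherwise the
// value is halved while keeping its lowest bit (so rounding is unaffected),
// converted as a signed integer, and then doubled.
func u64ToF64RefExample(v uint64) float64 {
	if v>>63 == 0 {
		return float64(int64(v)) // Fits in a signed 64-bit integer.
	}
	half := int64(v>>1 | v&1) // shr 1, then or in the preserved low bit.
	f := float64(half)
	return f + f // Double it back, mirroring the trailing ADDSD/ADDSS.
}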

func ( *machine) ( *ssa.Instruction) {
	 := .Arg()
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf(.Return())

	 := .c.AllocateVReg(ssa.TypeI32)

	 := .allocateInstr()
	.asXmmCmpRmR(sseOpcodePtest, , .reg())
	.insert()

	 := .allocateInstr()
	.asSetcc(condNZ, )
	.insert()

	// Clear the irrelevant bits.
	 := .allocateInstr()
	.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), , false)
	.insert()

	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction) {
	,  := .ArgWithLane()
	var  sseOpcode
	switch  {
	case ssa.VecLaneI8x16:
		 = sseOpcodePcmpeqb
	case ssa.VecLaneI16x8:
		 = sseOpcodePcmpeqw
	case ssa.VecLaneI32x4:
		 = sseOpcodePcmpeqd
	case ssa.VecLaneI64x2:
		 = sseOpcodePcmpeqq
	}
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf(.Return())

	 := .c.AllocateVReg(ssa.TypeV128)

	 := .allocateInstr()
	.asZeros()
	.insert()

	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	 := .allocateInstr()
	.asXmmCmpRmR(sseOpcodePtest, newOperandReg(), )
	.insert()

	 := .c.AllocateVReg(ssa.TypeI32)

	 := .allocateInstr()
	.asSetcc(condZ, )
	.insert()

	// Clear the irrelevant bits.
	 := .allocateInstr()
	.asAluRmiR(aluRmiROpcodeAnd, newOperandImm32(1), , false)
	.insert()

	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction) {
	,  := .ArgWithLane()
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf(.Return())
	switch  {
	case ssa.VecLaneI8x16:
		 := .allocateInstr()
		.asXmmToGpr(sseOpcodePmovmskb, .reg(), , false)
		.insert()

	case ssa.VecLaneI16x8:
		// When we have:
		// 	R1 = [R1(w1), R1(w2), R1(w3), R1(w4), R1(w5), R1(w6), R1(w7), R1(w8)]
		// 	R2 = [R2(w1), R2(w2), R2(w3), R2(w4), R2(w5), R2(w6), R2(w7), R2(w8)]
		//	where RX(wn) is n-th signed word (16-bit) of RX register,
		//
		// "PACKSSWB R1, R2" produces
		//  R1 = [
		// 		byte_sat(R1(w1)), byte_sat(R1(w2)), byte_sat(R1(w3)), byte_sat(R1(w4)),
		// 		byte_sat(R1(w5)), byte_sat(R1(w6)), byte_sat(R1(w7)), byte_sat(R1(w8)),
		// 		byte_sat(R2(w1)), byte_sat(R2(w2)), byte_sat(R2(w3)), byte_sat(R2(w4)),
		// 		byte_sat(R2(w5)), byte_sat(R2(w6)), byte_sat(R2(w7)), byte_sat(R2(w8)),
		//  ]
		//  where R1 is the destination register, and
		// 	byte_sat(w) = int8(w) if w fits as signed 8-bit,
		//                0x80 (i.e. -128) if w is less than -0x80,
		//                0x7F if w is greater than 0x7F
		//  (see the byteSatExample sketch after this function).
		//
		// See https://www.felixcloutier.com/x86/packsswb:packssdw for detail.
		//
		// Therefore, v.register ends up having i-th and (i+8)-th bit set if i-th lane is negative (for i in 0..8).
		 := .copyToTmp(.reg())
		 := .c.AllocateVReg(ssa.TypeI32)

		 := .allocateInstr()
		.asXmmRmR(sseOpcodePacksswb, , )
		.insert()

		 := .allocateInstr()
		.asXmmToGpr(sseOpcodePmovmskb, , , false)
		.insert()

		// Clear the bits higher than the 8th so that only an 8-bit mask remains.
		 := .allocateInstr()
		.asShiftR(shiftROpShiftRightLogical, newOperandImm32(8), , false)
		.insert()

		.copyTo(, )

	case ssa.VecLaneI32x4:
		 := .allocateInstr()
		.asXmmToGpr(sseOpcodeMovmskps, .reg(), , true)
		.insert()

	case ssa.VecLaneI64x2:
		 := .allocateInstr()
		.asXmmToGpr(sseOpcodeMovmskpd, .reg(), , true)
		.insert()
	}
}
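
// byteSatExample is a hypothetical model of the PACKSSWB saturation described in
// the I16x8 case above: a signed 16-bit word is clamped into the signed 8-bit
// range, so every negative word keeps its sign bit set in the packed byte, which
// is what PMOVMSKB then collects.
func byteSatExample(w int16) int8 {
	switch {
	case w < -0x80:
		return -0x80 // 0x80 when viewed as an unsigned byte.
	case w > 0x7f:
		return 0x7f
	default:
		return int8(w)
	}
}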

func ( *machine) ( *ssa.Instruction) {
	 := .Arg()
	 := .c.ValueDefinition()
	 := .getOperand_Reg()
	 := .c.VRegOf(.Return())

	 := .copyToTmp(.reg())
	 := .c.AllocateVReg(ssa.TypeV128)

	// Ensure tmp2 is considered defined by regalloc.
	.insert(.allocateInstr().asDefineUninitializedReg())

	// Set all bits on tmp register.
	 := .allocateInstr()
	.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(), )
	.insert()

	// Then XOR with tmp to reverse all bits on v.register.
	 := .allocateInstr()
	.asXmmRmR(sseOpcodePxor, newOperandReg(), )
	.insert()

	.copyTo(, )
}

func ( *machine) (,  ssa.Value,  ssa.VecLane) {
	 := .c.AllocateVReg(ssa.TypeV128)
	.insert(.allocateInstr().asDefineUninitializedReg())

	switch  {
	case ssa.VecLaneI8x16:
		 := .c.AllocateVReg(ssa.TypeV128)
		.insert(.allocateInstr().asDefineUninitializedReg())
		 := .getOperand_Mem_Reg(.c.ValueDefinition())
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePinsrb, 0, , ))
		.insert(.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(), ))
		.insert(.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(), ))
	case ssa.VecLaneI16x8:
		 := .getOperand_Reg(.c.ValueDefinition())
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 0, , ))
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePinsrw, 1, , ))
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(), ))
	case ssa.VecLaneI32x4:
		 := .getOperand_Mem_Reg(.c.ValueDefinition())
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePinsrd, 0, , ))
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(), ))
	case ssa.VecLaneI64x2:
		 := .getOperand_Reg(.c.ValueDefinition())
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 0, , ))
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePinsrq, 1, , ))
	case ssa.VecLaneF32x4:
		 := .getOperand_Mem_Reg(.c.ValueDefinition())
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodeInsertps, 0, , ))
		.insert(.allocateInstr().asXmmRmRImm(sseOpcodePshufd, 0, newOperandReg(), ))
	case ssa.VecLaneF64x2:
		 := .getOperand_Reg(.c.ValueDefinition())
		.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeMovsd, , ))
		.insert(.allocateInstr().asXmmRmR(sseOpcodeMovlhps, , ))
	default:
		panic(fmt.Sprintf("invalid lane type: %s", ))
	}

	.copyTo(, .c.VRegOf())
}

func ( *machine) (,  ssa.Value, ,  uint64,  ssa.Value) {
	var ,  [2]uint64
	for  := 0;  < 8; ++ {
		 := byte( >> ( * 8))
		if  < 16 {
			[0] |= uint64() << ( * 8)
			[0] |= uint64(0x80) << ( * 8)
		} else {
			[0] |= uint64(0x80) << ( * 8)
			[0] |= uint64(-16) << ( * 8)
		}
		 := byte( >> ( * 8))
		if  < 16 {
			[1] |= uint64() << ( * 8)
			[1] |= uint64(0x80) << ( * 8)
		} else {
			[1] |= uint64(0x80) << ( * 8)
			[1] |= uint64(-16) << ( * 8)
		}
	}

	,  := .allocateLabel()
	.consts = append(.consts, _const{lo: [0], hi: [1], label: , labelPos: })
	,  := .allocateLabel()
	.consts = append(.consts, _const{lo: [0], hi: [1], label: , labelPos: })

	,  := .getOperand_Reg(.c.ValueDefinition()), .getOperand_Reg(.c.ValueDefinition())
	,  := .copyToTmp(.reg()), .copyToTmp(.reg())

	// Apply mask to X.
	 := .c.AllocateVReg(ssa.TypeV128)
	 := .allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(.newAmodeRipRel()), )
	.insert()
	.insert(.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(), ))

	// Apply mask to Y.
	 := .allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, newOperandMem(.newAmodeRipRel()), )
	.insert()
	.insert(.allocateInstr().asXmmRmR(sseOpcodePshufb, newOperandReg(), ))

	// Combine the results.
	.insert(.allocateInstr().asXmmRmR(sseOpcodeOrps, newOperandReg(), ))

	.copyTo(, .c.VRegOf())
}
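
// pshufbRefExample is a hypothetical model of the PSHUFB behavior the mask
// construction above relies on: for each output byte, a mask byte with its top
// bit (0x80) set yields zero, otherwise its low four bits select a byte from
// src. Lane indices below 16 therefore select from x (and are masked out of the
// y mask with 0x80), while indices of 16 and above are rebased by -16 and select
// from y; the final OR merges the two halves.
func pshufbRefExample(src, mask [16]byte) (dst [16]byte) {
	for i, m := range mask {
		if m&0x80 == 0 {
			dst[i] = src[m&0xf]
		}
	}
	return
}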

func ( *machine) ( sseOpcode, , ,  ssa.Value) {
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf()

	 := .copyToTmp(.reg())

	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	.copyTo(, )
}

func ( *machine) ( sseOpcode, , ,  ssa.Value) {
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .getOperand_Mem_Reg(.c.ValueDefinition())
	 := .c.VRegOf()

	 := .copyToTmp(.reg())

	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	.copyTo(, )
}

func ( *machine) (,  ssa.Value,  ssa.FloatCmpCond,  ssa.Value,  ssa.VecLane) {
	var  sseOpcode
	switch  {
	case ssa.VecLaneF32x4:
		 = sseOpcodeCmpps
	case ssa.VecLaneF64x2:
		 = sseOpcodeCmppd
	default:
		panic(fmt.Sprintf("invalid lane type: %s", ))
	}

	,  := .c.ValueDefinition(), .c.ValueDefinition()
	var  cmpPred
	switch  {
	case ssa.FloatCmpCondGreaterThan:
		,  = , 
		 = cmpPredLT_OS
	case ssa.FloatCmpCondGreaterThanOrEqual:
		,  = , 
		 = cmpPredLE_OS
	case ssa.FloatCmpCondEqual:
		 = cmpPredEQ_OQ
	case ssa.FloatCmpCondNotEqual:
		 = cmpPredNEQ_UQ
	case ssa.FloatCmpCondLessThan:
		 = cmpPredLT_OS
	case ssa.FloatCmpCondLessThanOrEqual:
		 = cmpPredLE_OS
	default:
		panic(fmt.Sprintf("invalid float comparison condition: %s", ))
	}

	 := .c.AllocateVReg(ssa.TypeV128)
	 := .getOperand_Mem_Reg()
	.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, , ))

	 := .getOperand_Mem_Reg()
	.insert(.allocateInstr().asXmmRmRImm(, byte(), , ))

	.copyTo(, .c.VRegOf())
}

func ( *machine) (,  ssa.Value,  ssa.IntegerCmpCond,  ssa.Value,  ssa.VecLane) {
	var , , , ,  sseOpcode
	switch  {
	case ssa.VecLaneI8x16:
		, , , ,  = sseOpcodePcmpeqb, sseOpcodePcmpgtb, sseOpcodePmaxub, sseOpcodePminub, sseOpcodePminsb
	case ssa.VecLaneI16x8:
		, , , ,  = sseOpcodePcmpeqw, sseOpcodePcmpgtw, sseOpcodePmaxuw, sseOpcodePminuw, sseOpcodePminsw
	case ssa.VecLaneI32x4:
		, , , ,  = sseOpcodePcmpeqd, sseOpcodePcmpgtd, sseOpcodePmaxud, sseOpcodePminud, sseOpcodePminsd
	case ssa.VecLaneI64x2:
		,  = sseOpcodePcmpeqq, sseOpcodePcmpgtq
	default:
		panic(fmt.Sprintf("invalid lane type: %s", ))
	}

	 := .c.AllocateVReg(ssa.TypeV128)
	var  operand
	switch  {
	case ssa.IntegerCmpCondSignedLessThanOrEqual:
		if  == ssa.VecLaneI64x2 {
			 := .getOperand_Mem_Reg(.c.ValueDefinition())
			// Copy x to tmp.
			.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, , ))
			 = .getOperand_Mem_Reg(.c.ValueDefinition())
		} else {
			 := .getOperand_Mem_Reg(.c.ValueDefinition())
			// Copy y to tmp.
			.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, , ))
			 = .getOperand_Mem_Reg(.c.ValueDefinition())
		}
	case ssa.IntegerCmpCondSignedGreaterThanOrEqual:
		if  == ssa.VecLaneI64x2 {
			 := .getOperand_Mem_Reg(.c.ValueDefinition())
			// Copy y to tmp.
			.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, , ))
			 = .getOperand_Mem_Reg(.c.ValueDefinition())
		} else {
			 := .getOperand_Mem_Reg(.c.ValueDefinition())
			// Copy x to tmp.
			.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, , ))
			 = .getOperand_Mem_Reg(.c.ValueDefinition())
		}
	case ssa.IntegerCmpCondSignedLessThan, ssa.IntegerCmpCondUnsignedLessThan, ssa.IntegerCmpCondUnsignedLessThanOrEqual:
		 := .getOperand_Mem_Reg(.c.ValueDefinition())
		// Copy y to tmp.
		.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, , ))
		 = .getOperand_Mem_Reg(.c.ValueDefinition())
	default:
		 := .getOperand_Mem_Reg(.c.ValueDefinition())
		// Copy x to tmp.
		.insert(.allocateInstr().asXmmUnaryRmR(sseOpcodeMovdqu, , ))
		 = .getOperand_Mem_Reg(.c.ValueDefinition())
	}

	switch  {
	case ssa.IntegerCmpCondEqual:
		.insert(.allocateInstr().asXmmRmR(, , ))
	case ssa.IntegerCmpCondNotEqual:
		// First we compare for equality.
		.insert(.allocateInstr().asXmmRmR(, , ))
		// Then flip the bits. To do so, we set all bits on tmp2.
		 := .c.AllocateVReg(ssa.TypeV128)
		.insert(.allocateInstr().asDefineUninitializedReg())
		.insert(.allocateInstr().asXmmRmR(, newOperandReg(), ))
		// And then xor with tmp.
		.insert(.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(), ))
	case ssa.IntegerCmpCondSignedGreaterThan, ssa.IntegerCmpCondSignedLessThan:
		.insert(.allocateInstr().asXmmRmR(, , ))
	case ssa.IntegerCmpCondSignedGreaterThanOrEqual, ssa.IntegerCmpCondSignedLessThanOrEqual:
		if  == ssa.VecLaneI64x2 {
			.insert(.allocateInstr().asXmmRmR(, , ))
			// Then flip the bits. To do so, we set all bits on tmp2.
			 := .c.AllocateVReg(ssa.TypeV128)
			.insert(.allocateInstr().asDefineUninitializedReg())
			.insert(.allocateInstr().asXmmRmR(, newOperandReg(), ))
			// And then xor with tmp.
			.insert(.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(), ))
		} else {
			// First take min of x and y.
			.insert(.allocateInstr().asXmmRmR(, , ))
			// Then compare for equality.
			.insert(.allocateInstr().asXmmRmR(, , ))
		}
	case ssa.IntegerCmpCondUnsignedGreaterThan, ssa.IntegerCmpCondUnsignedLessThan:
		// First maxu of x and y.
		.insert(.allocateInstr().asXmmRmR(, , ))
		// Then compare for equality.
		.insert(.allocateInstr().asXmmRmR(, , ))
		// Then flip the bits. To do so, we set all bits on tmp2.
		 := .c.AllocateVReg(ssa.TypeV128)
		.insert(.allocateInstr().asDefineUninitializedReg())
		.insert(.allocateInstr().asXmmRmR(, newOperandReg(), ))
		// And then xor with tmp.
		.insert(.allocateInstr().asXmmRmR(sseOpcodePxor, newOperandReg(), ))
	case ssa.IntegerCmpCondUnsignedGreaterThanOrEqual, ssa.IntegerCmpCondUnsignedLessThanOrEqual:
		.insert(.allocateInstr().asXmmRmR(, , ))
		.insert(.allocateInstr().asXmmRmR(, , ))
	default:
		panic("BUG")
	}

	.copyTo(, .c.VRegOf())
}
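
// laneU32LessEqualExample is a hypothetical per-lane model of one of the tricks
// above: SSE has no unsigned comparison, so unsigned x <= y is lowered as
// "min_u(x, y) == x" (and unsigned < as the negation of "max_u(x, y) == x"),
// producing an all-ones lane when the predicate holds and zero otherwise.
func laneU32LessEqualExample(x, y uint32) uint32 {
	minXY := x
	if y < x {
		minXY = y
	}
	if minXY == x { // x <= y.
		return 0xffffffff
	}
	return 0
}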

func ( *machine) ( *ssa.Instruction,  sseOpcode) {
	,  := .Arg2()
	 := .c.ValueDefinition()
	 := .c.ValueDefinition()
	,  := .getOperand_Reg(), .getOperand_Reg()
	 := .c.VRegOf(.Return())

	 := .copyToTmp(.reg())

	// pandn between rn, rm.
	 := .allocateInstr()
	.asXmmRmR(sseOpcodePandn, , )
	.insert()

	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction) {
	, ,  := .SelectData()
	 := .c.ValueDefinition()
	 := .c.ValueDefinition()
	,  := .getOperand_Reg(), .getOperand_Reg()
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf(.Return())

	 := .copyToTmp(.reg())
	 := .copyToTmp(.reg())

	// And between c, x (overwrites x).
	 := .allocateInstr()
	.asXmmRmR(sseOpcodePand, , )
	.insert()

	// Andn between y, c (overwrites c).
	 := .allocateInstr()
	.asXmmRmR(sseOpcodePandn, , )
	.insert()

	 := .allocateInstr()
	.asXmmRmR(sseOpcodePor, newOperandReg(), )
	.insert()

	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction) {
	, ,  := .Arg2WithLane()
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf(.Return())

	var , , , ,  /* shift right logical */ sseOpcode
	var  uint32
	if  == ssa.VecLaneF32x4 {
		, , , , ,  = sseOpcodeMinps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodePsrld, 0xa
	} else {
		, , , , ,  = sseOpcodeMinpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodePsrlq, 0xd
	}

	 := .copyToTmp(.reg())
	 := .copyToTmp(.reg())

	// tmp1=min(rn, rm)
	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	// tmp2=min(rm, rn)
	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	// tmp3:=tmp1=min(rn, rm)
	 := .copyToTmp()

	// tmp1 = -0          if (rn == -0 || rm == -0) && rn != NaN && rm != NaN
	//        NaN         if rn == NaN || rm == NaN
	//        min(rn, rm) otherwise
	 := .allocateInstr()
	.asXmmRmR(, newOperandReg(), )
	.insert()

	// tmp3 is originally min(rn,rm).
	// tmp3 = ^0 (all bits set) if rn == NaN || rm == NaN
	//        0 otherwise
	 := .allocateInstr()
	.asXmmRmRImm(, uint8(cmpPredUNORD_Q), newOperandReg(), )
	.insert()

	// tmp1 = -0          if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN
	//        ^0          if rn == NaN || rm == NaN
	//        min(v1, v2) otherwise
	 := .allocateInstr()
	.asXmmRmR(, newOperandReg(), )
	.insert()

	// tmp3 = set all bits on the mantissa bits
	//        0 otherwise
	 := .allocateInstr()
	.asXmmRmiReg(, newOperandImm32(), )
	.insert()

	// tmp3 = tmp1 and !tmp3
	//     = -0                                                   if (rn == -0 || rm == -0) && rn != NaN && rm !=NaN
	//       all bits set on the exponent and sign bits (== NaN)    if rn == NaN || rm == NaN
	//       min(rn, rm)                                          otherwise
	 := .allocateInstr()
	.asXmmRmR(, newOperandReg(), )
	.insert()

	.copyTo(, )
}

func ( *machine) ( *ssa.Instruction) {
	, ,  := .Arg2WithLane()
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .getOperand_Reg(.c.ValueDefinition())
	 := .c.VRegOf(.Return())

	var , , , , , ,  /* shift right logical */ sseOpcode
	var  uint32
	if  == ssa.VecLaneF32x4 {
		, , , , , , ,  = sseOpcodeMaxps, sseOpcodeCmpps, sseOpcodeAndnps, sseOpcodeOrps, sseOpcodeXorps, sseOpcodeSubps, sseOpcodePsrld, 0xa
	} else {
		, , , , , , ,  = sseOpcodeMaxpd, sseOpcodeCmppd, sseOpcodeAndnpd, sseOpcodeOrpd, sseOpcodeXorpd, sseOpcodeSubpd, sseOpcodePsrlq, 0xd
	}

	 := .copyToTmp(.reg())
	 := .copyToTmp(.reg())

	// tmp0=max(rn, rm)
	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	// tmp1=max(rm, rn)
	 := .allocateInstr()
	.asXmmRmR(, , )
	.insert()

	// tmp2=max(rm, rn)
	 := .copyToTmp()

	// tmp2 = -0       if (rn == -0 && rm == 0) || (rn == 0 && rm == -0)
	//         0       if (rn == 0 && rm ==  0)
	//        -0       if (rn == -0 && rm == -0)
	//       v1^v2     if rn == NaN || rm == NaN
	//         0       otherwise
	 := .allocateInstr()
	.asXmmRmR(, newOperandReg(), )
	.insert()
	// tmp1 = -0           if (rn == -0 && rm == 0) || (rn == 0 && rm == -0)
	//         0           if (rn == 0 && rm ==  0)
	//        -0           if (rn == -0 && rm == -0)
	//        NaN          if rn == NaN || rm == NaN
	//        max(v1, v2)  otherwise
	 := .allocateInstr()
	.asXmmRmR(, newOperandReg(), )
	.insert()

	 := .copyToTmp()

	// tmp3 = 0           if (rn == -0 && rm == 0) || (rn == 0 && rm == -0) || (rn == 0 && rm ==  0)
	//       -0           if (rn == -0 && rm == -0)
	//       NaN          if rn == NaN || rm == NaN
	//       max(v1, v2)  otherwise
	//
	// Note: -0 - (-0) = 0 (!= -0) in floating point operation.
	 := .allocateInstr()
	.asXmmRmR(, newOperandReg(), )
	.insert()

	// tmp1 = ^0 (all bits set) if rn == NaN || rm == NaN
	 := .allocateInstr()
	.asXmmRmRImm(, uint8(cmpPredUNORD_Q), newOperandReg(), )
	.insert()

	// tmp1 = set all bits on the mantissa bits
	//        0 otherwise
	 := .allocateInstr()
	.asXmmRmiReg(, newOperandImm32(), )
	.insert()

	 := .allocateInstr()
	.asXmmRmR(, newOperandReg(), )
	.insert()

	.copyTo(, )
}
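
// f64x2MaxLaneExample is a hypothetical per-lane model of the sequence above:
// MAXPD alone is not commutative for NaN and signed zeros, so the lowering takes
// the max in both operand orders, combines them so that +0 is preferred over -0,
// and rewrites any unordered lane into a canonical NaN.
func f64x2MaxLaneExample(x, y float64) float64 {
	if x != x || y != y {
		return math.NaN() // Unordered lanes become a canonical NaN.
	}
	if x == y { // Covers +0 vs -0: AND of the bit patterns prefers +0.
		return math.Float64frombits(math.Float64bits(x) & math.Float64bits(y))
	}
	if x > y {
		return x
	}
	return y
}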

func ( *machine) ( *ssa.Instruction) {
	,  := .ArgWithLane()
	 := .getOperand_Mem_Reg(.c.ValueDefinition())
	 := .c.VRegOf(.Return())

	 := .c.AllocateVReg(ssa.TypeV128)

	 := .allocateInstr()
	.asDefineUninitializedReg()
	.insert()

	// Set all bits on tmp.
	 := .allocateInstr()
	.asXmmRmR(sseOpcodePcmpeqd, newOperandReg(), )
	.insert()

	switch  {
	case ssa.VecLaneF32x4:
		// Shift right packed single floats by 1 to clear the sign bits.
		 := .allocateInstr()
		.asXmmRmiReg(sseOpcodePsrld, newOperandImm32(1), )
		.insert()
		// Clear the sign bit of rm.
		 := .allocateInstr()
		.asXmmRmR(sseOpcodeAndpd, , )
		.insert()
	case ssa.VecLaneF64x2:
		// Shift right packed double floats by 1 to clear the sign bits.
		 := .allocateInstr()
		.asXmmRmiReg(sseOpcodePsrlq, newOperandImm32(1), )
		.insert()
		// Clear the sign bit of rm.
		 := .allocateInstr()
		.asXmmRmR(sseOpcodeAndps, , )
		.insert()
	}

	.copyTo(, )
}