//go:build !noasm && gc && arm64 && !amd64

#include "textflag.h"

#define RoundConst0 $0x5A827999 // 1518500249, used by steps 0-19
#define RoundConst1 $0x6ED9EBA1 // 1859775393, used by steps 20-39
#define RoundConst2 $0x8F1BBCDC // 2400959708, used by steps 40-59
#define RoundConst3 $0xCA62C1D6 // 3395469782, used by steps 60-79

// FUNC1 leaves f = (b & c) | (~b & d) in R15.
// It is evaluated without a NOT as ((c ^ d) & b) ^ d.
// Writes: R15 only. b, c and d are read, never modified.
#define FUNC1(b, c, d) \
	EORW c, d, R15; \
	ANDW b, R15, R15; \
	EORW d, R15, R15

// FUNC2 leaves f = b ^ c ^ d in R15.
// Writes: R15 only.
#define FUNC2(b, c, d) \
	EORW b, c, R15; \
	EORW d, R15, R15

// FUNC3 leaves f = (b & c) | (b & d) | (c & d) in R15.
// It is evaluated as ((b | c) & d) | (b & c).
// Writes: R27 (holds (b | c) & d) and R15 (result).
#define FUNC3(b, c, d) \
	ORRW c, b, R27; \
	ANDW d, R27, R27; \
	ANDW c, b, R15; \
	ORRW R27, R15, R15

// FUNC4 leaves f = b ^ c ^ d in R15 (the same function FUNC2 computes).
// Writes: R15 only.
#define FUNC4(b, c, d) \
	MOVW b, R15; \
	EORW c, R15; \
	EORW d, R15
	
// MIX finishes one step once f is in R15 and the step's message word is in R9:
//   e += rotl32(a, 5) + f + k + w
//   b  = rotl32(b, 30)
// The rotations are written as right-rotates by 27 and 2 respectively.
// c and d are accepted for symmetry with the callers but are not touched.
// Writes: e, b, R27 (rotl32(a, 5)) and R19 (k).
// NOTE(review): R27 is documented as reserved for the Go toolchain on arm64;
// this relies on no instruction between its write and its read needing an
// assembler temporary - confirm if the sequence is ever changed.
#define MIX(a, b, c, d, e, k) \
	RORW $27, a, R27; \
	ADDW R27, e, e; \
	ADDW R15, e, e; \
	MOVW k, R19; \
	ADDW R19, e, e; \
	ADDW R9, e, e; \
	RORW $2, b, b

// Message-schedule buffer shared by LOAD, SHUFFLE and LOADM1.
//
// Sixteen 32-bit words are kept in the function's 64-byte local frame and
// reused as a ring indexed by (index & 0xf). Slot i lives at (8 + 4*i)(RSP):
// on arm64 the Go assembler's prologue saves the return address at 0(RSP) for
// any function with a frame, and the declared locals start at 8(RSP). Keeping
// the buffer at 8(RSP)..71(RSP) leaves that saved return address intact for
// anything that unwinds the stack while this function is running.

// LOAD reads message word `index` (0-15) from the input block at R16,
// byte-reverses it (REVW) and stores it in schedule slot `index`.
// The word is left in R9 for MIX.
// Writes: R9 and one schedule slot.
#define LOAD(index) \
	MOVWU (index*4)(R16), R9; \
	REVW R9, R9; \
	MOVW R9, ((index*4)+8)(RSP)

// LOADCS stores the five working values a, b, c, d, e (in that order, four
// bytes each) at byte offset index*20 from the cs base pointer, i.e. into
// cs[index] of the [][5]uint32 argument. Despite the name it only stores.
// Writes: R27 (cs base pointer) and 20 bytes of memory.
#define LOADCS(a, b, c, d, e, index) \
	MOVD cs_base+56(FP), R27; \
	MOVW a, ((index*20))(R27); \
	MOVW b, ((index*20)+4)(R27); \
	MOVW c, ((index*20)+8)(R27); \
	MOVW d, ((index*20)+12)(R27); \
	MOVW e, ((index*20)+16)(R27)

// SHUFFLE extends the message schedule for step `index` (16 and up):
//   w[i] = rotl32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1)
// All indices are taken modulo 16, so w[i-16] is the slot being replaced.
// The rotate-left by 1 is written as a right-rotate by 31.
// The new word is left in R9 for MIX.
// Writes: R9, R20 and one schedule slot.
#define SHUFFLE(index) \
	MOVW (((index&0xf)*4)+8)(RSP), R9; \
	MOVW ((((index-3)&0xf)*4)+8)(RSP), R20; \
	EORW R20, R9; \
	MOVW ((((index-8)&0xf)*4)+8)(RSP), R20; \
	EORW R20, R9; \
	MOVW ((((index-14)&0xf)*4)+8)(RSP), R20; \
	EORW R20, R9; \
	RORW $31, R9, R9; \
	MOVW R9, (((index&0xf)*4)+8)(RSP)

// LOADM1 copies the schedule word for step `index` (slot index & 0xf) to
// m1[index] in the caller-supplied m1 slice.
// Writes: R27 (m1 base pointer), R9 (reloaded with the same word) and m1[index].
#define LOADM1(index) \
	MOVD m1_base+32(FP), R27; \
	MOVW (((index&0xf)*4)+8)(RSP), R9; \
	MOVW R9, (index*4)(R27)

// ROUND1 performs one step that takes its message word from the input:
// LOAD fetches word `index` into R9, FUNC1 puts f in R15, MIX folds f, R9 and
// RoundConst0 into e and rotates b, then LOADM1 copies the word to m1[index].
// The order matters: MIX consumes R9 and R15, and LOADM1 overwrites R9 and R27.
#define ROUND1(a, b, c, d, e, index) \
	LOAD(index); \
	FUNC1(b, c, d); \
	MIX(a, b, c, d, e, RoundConst0); \
	LOADM1(index)

// ROUND1x is ROUND1 with the message word derived from earlier schedule words
// (SHUFFLE) instead of being read from the input block. It still uses FUNC1
// and RoundConst0, and still records the word in m1[index].
#define ROUND1x(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC1(b, c, d); \
	MIX(a, b, c, d, e, RoundConst0); \
	LOADM1(index)

// ROUND2 performs one step using the schedule word from SHUFFLE, the XOR
// function FUNC2 and RoundConst1, then records the word in m1[index].
#define ROUND2(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC2(b, c, d); \
	MIX(a, b, c, d, e, RoundConst1); \
	LOADM1(index)

// ROUND3 performs one step using the schedule word from SHUFFLE, the majority
// function FUNC3 and RoundConst2, then records the word in m1[index].
// FUNC3 also uses R27 as scratch; MIX overwrites it afterwards.
#define ROUND3(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC3(b, c, d); \
	MIX(a, b, c, d, e, RoundConst2); \
	LOADM1(index)

// ROUND4 performs one step using the schedule word from SHUFFLE, FUNC4
// (defined as FUNC2, i.e. b ^ c ^ d) and RoundConst3, then records the word in
// m1[index].
#define ROUND4(a, b, c, d, e, index) \
	SHUFFLE(index); \
	FUNC4(b, c, d); \
	MIX(a, b, c, d, e, RoundConst3); \
	LOADM1(index)

// func blockARM64(dig *digest, p []byte, m1 []uint32, cs [][5]uint32)
//
// blockARM64 runs the 80-step compression loop over every whole 64-byte block
// of p, updating the five 32-bit state words stored at offsets 0-16 from dig.
// While doing so it writes each step's message word to m1[step] and stores
// three snapshots of the working state into cs[0], cs[1] and cs[2] (taken
// before steps 0, 58 and 65). m1 and cs are overwritten on every block, so
// after the call they describe the last block processed.
//
// Assumptions visible from the accesses below (verify against the Go side):
//   - the five state words are the first 20 bytes of *dig;
//   - m1 has room for 80 words and cs for 3 entries; no bounds are checked.
//
// Frame: 64 bytes of locals hold the 16-word message schedule used by
// LOAD/SHUFFLE/LOADM1; 80 bytes of arguments.
//
// Register use:
//   R8       dig pointer
//   R16      cursor into p (advanced by 64 per block)
//   R21      end pointer: p + (len(p) rounded down to a multiple of 64)
//   R1-R5    running state h0-h4 (written back to dig only at the end)
//   R10-R14  working variables a-e; their roles rotate from step to step
//   R9, R15, R19, R20, R27  scratch used inside the macros
TEXT ·blockARM64(SB), NOSPLIT, $64-80
    MOVD    dig+0(FP), R8
    MOVD    p_base+8(FP), R16
    MOVD    p_len+16(FP), R10

    // Round len(p) down to a multiple of 64 and form the end pointer.
    LSR     $6, R10, R10
    LSL     $6, R10, R10
    ADD     R16, R10, R21

    // Load h0-h4 into R1-R5.
    MOVW    (R8), R1                   // R1 = h0
    MOVW    4(R8), R2                  // R2 = h1
    MOVW    8(R8), R3                  // R3 = h2
    MOVW    12(R8), R4                 // R4 = h3
    MOVW    16(R8), R5                 // R5 = h4

loop:
    // Stop once the cursor (R16) has reached the end pointer (R21), i.e. when
    // no whole 64-byte block remains. Unsigned compare: exit if R21 <= R16.
    CMP     R16, R21
    BLS     end

	// Initialize registers a, b, c, d, e from the running state.
	MOVW R1, R10
	MOVW R2, R11
	MOVW R3, R12
	MOVW R4, R13
	MOVW R5, R14

	// Each step below passes the five working registers rotated by one
	// position relative to the previous step, so no register moves are needed
	// between steps; the pattern repeats every five steps.

	// ROUND1 (steps 0-15)
	// Snapshot the state before step 0 into cs[0].
	LOADCS(R10, R11, R12, R13, R14, 0)
	ROUND1(R10, R11, R12, R13, R14, 0)
	ROUND1(R14, R10, R11, R12, R13, 1)
	ROUND1(R13, R14, R10, R11, R12, 2)
	ROUND1(R12, R13, R14, R10, R11, 3)
	ROUND1(R11, R12, R13, R14, R10, 4)
	ROUND1(R10, R11, R12, R13, R14, 5)
	ROUND1(R14, R10, R11, R12, R13, 6)
	ROUND1(R13, R14, R10, R11, R12, 7)
	ROUND1(R12, R13, R14, R10, R11, 8)
	ROUND1(R11, R12, R13, R14, R10, 9)
	ROUND1(R10, R11, R12, R13, R14, 10)
	ROUND1(R14, R10, R11, R12, R13, 11)
	ROUND1(R13, R14, R10, R11, R12, 12)
	ROUND1(R12, R13, R14, R10, R11, 13)
	ROUND1(R11, R12, R13, R14, R10, 14)
	ROUND1(R10, R11, R12, R13, R14, 15)

	// ROUND1x (steps 16-19) - same as ROUND1 but with no data load.
	ROUND1x(R14, R10, R11, R12, R13, 16)
	ROUND1x(R13, R14, R10, R11, R12, 17)
	ROUND1x(R12, R13, R14, R10, R11, 18)
	ROUND1x(R11, R12, R13, R14, R10, 19)

	// ROUND2 (steps 20-39)
	ROUND2(R10, R11, R12, R13, R14, 20)
	ROUND2(R14, R10, R11, R12, R13, 21)
	ROUND2(R13, R14, R10, R11, R12, 22)
	ROUND2(R12, R13, R14, R10, R11, 23)
	ROUND2(R11, R12, R13, R14, R10, 24)
	ROUND2(R10, R11, R12, R13, R14, 25)
	ROUND2(R14, R10, R11, R12, R13, 26)
	ROUND2(R13, R14, R10, R11, R12, 27)
	ROUND2(R12, R13, R14, R10, R11, 28)
	ROUND2(R11, R12, R13, R14, R10, 29)
	ROUND2(R10, R11, R12, R13, R14, 30)
	ROUND2(R14, R10, R11, R12, R13, 31)
	ROUND2(R13, R14, R10, R11, R12, 32)
	ROUND2(R12, R13, R14, R10, R11, 33)
	ROUND2(R11, R12, R13, R14, R10, 34)
	ROUND2(R10, R11, R12, R13, R14, 35)
	ROUND2(R14, R10, R11, R12, R13, 36)
	ROUND2(R13, R14, R10, R11, R12, 37)
	ROUND2(R12, R13, R14, R10, R11, 38)
	ROUND2(R11, R12, R13, R14, R10, 39)

	// ROUND3 (steps 40-59)
	ROUND3(R10, R11, R12, R13, R14, 40)
	ROUND3(R14, R10, R11, R12, R13, 41)
	ROUND3(R13, R14, R10, R11, R12, 42)
	ROUND3(R12, R13, R14, R10, R11, 43)
	ROUND3(R11, R12, R13, R14, R10, 44)
	ROUND3(R10, R11, R12, R13, R14, 45)
	ROUND3(R14, R10, R11, R12, R13, 46)
	ROUND3(R13, R14, R10, R11, R12, 47)
	ROUND3(R12, R13, R14, R10, R11, 48)
	ROUND3(R11, R12, R13, R14, R10, 49)
	ROUND3(R10, R11, R12, R13, R14, 50)
	ROUND3(R14, R10, R11, R12, R13, 51)
	ROUND3(R13, R14, R10, R11, R12, 52)
	ROUND3(R12, R13, R14, R10, R11, 53)
	ROUND3(R11, R12, R13, R14, R10, 54)
	ROUND3(R10, R11, R12, R13, R14, 55)
	ROUND3(R14, R10, R11, R12, R13, 56)
	ROUND3(R13, R14, R10, R11, R12, 57)

	// Snapshot the state before step 58 into cs[1]. The register order matches
	// the a..e roles of step 58.
	LOADCS(R12, R13, R14, R10, R11, 1)
	ROUND3(R12, R13, R14, R10, R11, 58)
	ROUND3(R11, R12, R13, R14, R10, 59)

	// ROUND4 (steps 60-79)
	ROUND4(R10, R11, R12, R13, R14, 60)
	ROUND4(R14, R10, R11, R12, R13, 61)
	ROUND4(R13, R14, R10, R11, R12, 62)
	ROUND4(R12, R13, R14, R10, R11, 63)
	ROUND4(R11, R12, R13, R14, R10, 64)

	// Snapshot the state before step 65 into cs[2].
	LOADCS(R10, R11, R12, R13, R14, 2)
	ROUND4(R10, R11, R12, R13, R14, 65)
	ROUND4(R14, R10, R11, R12, R13, 66)
	ROUND4(R13, R14, R10, R11, R12, 67)
	ROUND4(R12, R13, R14, R10, R11, 68)
	ROUND4(R11, R12, R13, R14, R10, 69)
	ROUND4(R10, R11, R12, R13, R14, 70)
	ROUND4(R14, R10, R11, R12, R13, 71)
	ROUND4(R13, R14, R10, R11, R12, 72)
	ROUND4(R12, R13, R14, R10, R11, 73)
	ROUND4(R11, R12, R13, R14, R10, 74)
	ROUND4(R10, R11, R12, R13, R14, 75)
	ROUND4(R14, R10, R11, R12, R13, 76)
	ROUND4(R13, R14, R10, R11, R12, 77)
	ROUND4(R12, R13, R14, R10, R11, 78)
	ROUND4(R11, R12, R13, R14, R10, 79)

	// Add registers to temp hash. After 80 steps the rotation is back at its
	// starting position, so R10-R14 again hold a-e in order.
	ADDW R10, R1, R1
	ADDW R11, R2, R2
	ADDW R12, R3, R3
	ADDW R13, R4, R4
	ADDW R14, R5, R5

	// Advance the cursor to the next 64-byte block.
	ADD  $64, R16, R16
	B  loop

end:
	// Write the updated state back through dig.
	MOVW R1, (R8)
	MOVW R2, 4(R8)
	MOVW R3, 8(R8)
	MOVW R4, 12(R8)
	MOVW R5, 16(R8)
	RET