| |
| |
| |
| |
| |
| |
| |
| |
| |
| #include <linux/linkage.h> |
| |
| <------>.text |
| <------>.fpu neon |
| |
| <------>/* Core registers that carry the four arguments of blake2b_compress_neon() */ |
| <------>STATE .req r0 |
| <------>BLOCK .req r1 |
| <------>NBLOCKS .req r2 |
| <------>INC .req r3 |
| |
| <------>/* Base addresses of the two vtbl.8 byte-rotation tables */ |
| <------>ROR24_TABLE .req r4 |
| <------>ROR16_TABLE .req r5 |
| |
| <------>/* Caller's sp, kept here while sp points at the aligned scratch buffer */ |
| <------>ORIG_SP .req r6 |
| |
| <------>/* NEON d registers holding the sixteen 64-bit message words of the current block. */ |
| <------>/* M_0-M_3 (q8-q9) are also used as scratch by _blake2b_round and then reloaded. */ |
| <------>M_0 .req d16 |
| <------>M_1 .req d17 |
| <------>M_2 .req d18 |
| <------>M_3 .req d19 |
| <------>M_4 .req d20 |
| <------>M_5 .req d21 |
| <------>M_6 .req d22 |
| <------>M_7 .req d23 |
| <------>M_8 .req d24 |
| <------>M_9 .req d25 |
| <------>M_10 .req d26 |
| <------>M_11 .req d27 |
| <------>M_12 .req d28 |
| <------>M_13 .req d29 |
| <------>M_14 .req d30 |
| <------>M_15 .req d31 |
| |
| <------>.align 4 |
| <------>/* Index tables for vtbl.8.  Result byte i is taken from source byte */ |
| <------>/* (i + 3) % 8 for the first table and (i + 2) % 8 for the second, which */ |
| <------>/* rotates a 64-bit lane right by 24 or 16 bits.  Both 8-byte tables are */ |
| <------>/* read with :64-aligned loads; .align 4 (16 bytes on ARM) guarantees that. */ |
| .Lror24_table: |
| <------>.byte 3, 4, 5, 6, 7, 0, 1, 2 |
| .Lror16_table: |
| <------>.byte 2, 3, 4, 5, 6, 7, 0, 1 |
| <------>/* BLAKE2b initialization vector IV[0..7], one 64-bit word per entry */ |
| .Lblake2b_IV: |
| <------>.quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b |
| <------>.quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1 |
| <------>.quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f |
| <------>.quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179 |
| |
| /* Run one BLAKE2b round on the state matrix v[0..15], held in d0-d15 (q0-q7): */ |
| /* a = v[0..3] in d0-d3, b = v[4..7] in d4-d7, c = v[8..11] in d8-d11, */ |
| /* d = v[12..15] in d12-d15.  Message words are read from M_0-M_15 (q8-q15). */ |
| /* sp must point at a 32-byte aligned copy of q8-q9 (M_0-M_3): those registers */ |
| /* are used as scratch here and restored from [sp].  s0-s15 name the message */ |
| /* word fed to each addition; final=1 skips the last restore of q8-q9. */ |
| .macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \ |
| <------><------><------>s8, s9, s10, s11, s12, s13, s14, s15, final=0 |
| |
| <------>/* Column step: mix (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]), */ |
| <------>/* (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).  Two columns */ |
| <------>/* are processed per q register operation. */ |
| |
| <------>/* a += b + m[s0, s2, s4, s6] */ |
| <------>vadd.u64 q0, q0, q2 |
| <------>vadd.u64 q1, q1, q3 |
| <------>vadd.u64 d0, d0, M_\s0 |
| <------>vadd.u64 d1, d1, M_\s2 |
| <------>vadd.u64 d2, d2, M_\s4 |
| <------>vadd.u64 d3, d3, M_\s6 |
| |
| <------>/* d = ror64(d ^ a, 32): swapping the 32-bit halves of each 64-bit lane */ |
| <------>veor q6, q6, q0 |
| <------>veor q7, q7, q1 |
| <------>vrev64.32 q6, q6 |
| <------>vrev64.32 q7, q7 |
| |
| <------>/* c += d */ |
| <------>vadd.u64 q4, q4, q6 |
| <------>vadd.u64 q5, q5, q7 |
| |
| <------>/* b = ror64(b ^ c, 24): byte permute via vtbl.8; the table overwrites M_0 */ |
| <------>vld1.8 {M_0}, [ROR24_TABLE, :64] |
| <------>veor q2, q2, q4 |
| <------>veor q3, q3, q5 |
| <------>vtbl.8 d4, {d4}, M_0 |
| <------>vtbl.8 d5, {d5}, M_0 |
| <------>vtbl.8 d6, {d6}, M_0 |
| <------>vtbl.8 d7, {d7}, M_0 |
| |
| <------>/* a += b + m[s1, s3, s5, s7] */ |
| <------>/* */ |
| <------>/* M_0 currently holds the ror24 table, so the real message word is */ |
| <------>/* reloaded from the stack copy only when one of these four additions uses */ |
| <------>/* word 0.  Otherwise the reload is skipped; M_0 is overwritten again below. */ |
| .if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0 |
| <------>vld1.8 {M_0}, [sp, :64] |
| .endif |
| <------>vadd.u64 q0, q0, q2 |
| <------>vadd.u64 q1, q1, q3 |
| <------>vadd.u64 d0, d0, M_\s1 |
| <------>vadd.u64 d1, d1, M_\s3 |
| <------>vadd.u64 d2, d2, M_\s5 |
| <------>vadd.u64 d3, d3, M_\s7 |
| |
| <------>/* d = ror64(d ^ a, 16): byte permute via vtbl.8; the table overwrites M_0 */ |
| <------>vld1.8 {M_0}, [ROR16_TABLE, :64] |
| <------>veor q6, q6, q0 |
| <------>veor q7, q7, q1 |
| <------>vtbl.8 d12, {d12}, M_0 |
| <------>vtbl.8 d13, {d13}, M_0 |
| <------>vtbl.8 d14, {d14}, M_0 |
| <------>vtbl.8 d15, {d15}, M_0 |
| |
| <------>/* c += d */ |
| <------>vadd.u64 q4, q4, q6 |
| <------>vadd.u64 q5, q5, q7 |
| |
| <------>/* b = ror64(b ^ c, 63) */ |
| <------>/* */ |
| <------>/* Not a whole number of bytes, so it is built from a right shift by 63 */ |
| <------>/* with the value shifted left by 1 inserted on top (vshr + vsli).  q8-q9 */ |
| <------>/* (M_0-M_3) hold the xor result as temporaries and are restored afterwards. */ |
| <------>veor q8, q2, q4 |
| <------>veor q9, q3, q5 |
| <------>vshr.u64 q2, q8, #63 |
| <------>vshr.u64 q3, q9, #63 |
| <------>vsli.u64 q2, q8, #1 |
| <------>vsli.u64 q3, q9, #1 |
| <------>vld1.8 {q8-q9}, [sp, :256] |
| |
| <------>/* Diagonal step: mix (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]), */ |
| <------>/* (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]). */ |
| <------>/* */ |
| <------>/* The rows are not shifted to line the diagonals up as columns.  Instead */ |
| <------>/* every operation below is a 64-bit operation on individual d registers, */ |
| <------>/* pairing a = d0..d3 with b = d5, d6, d7, d4, with c = d10, d11, d8, d9 and */ |
| <------>/* with d = d15, d12, d13, d14. */ |
| <------> |
| <------> |
| |
| <------>/* a += b + m[s8, s10, s12, s14] */ |
| <------>vadd.u64 d0, d0, d5 |
| <------>vadd.u64 d1, d1, d6 |
| <------>vadd.u64 d2, d2, d7 |
| <------>vadd.u64 d3, d3, d4 |
| <------>vadd.u64 d0, d0, M_\s8 |
| <------>vadd.u64 d1, d1, M_\s10 |
| <------>vadd.u64 d2, d2, M_\s12 |
| <------>vadd.u64 d3, d3, M_\s14 |
| |
| <------>/* d = ror64(d ^ a, 32) */ |
| <------>veor d15, d15, d0 |
| <------>veor d12, d12, d1 |
| <------>veor d13, d13, d2 |
| <------>veor d14, d14, d3 |
| <------>vrev64.32 d15, d15 |
| <------>vrev64.32 d12, d12 |
| <------>vrev64.32 d13, d13 |
| <------>vrev64.32 d14, d14 |
| |
| <------>/* c += d */ |
| <------>vadd.u64 d10, d10, d15 |
| <------>vadd.u64 d11, d11, d12 |
| <------>vadd.u64 d8, d8, d13 |
| <------>vadd.u64 d9, d9, d14 |
| |
| <------>/* b = ror64(b ^ c, 24); M_0 is overwritten by the table again */ |
| <------>vld1.8 {M_0}, [ROR24_TABLE, :64] |
| <------>veor d5, d5, d10 |
| <------>veor d6, d6, d11 |
| <------>veor d7, d7, d8 |
| <------>veor d4, d4, d9 |
| <------>vtbl.8 d5, {d5}, M_0 |
| <------>vtbl.8 d6, {d6}, M_0 |
| <------>vtbl.8 d7, {d7}, M_0 |
| <------>vtbl.8 d4, {d4}, M_0 |
| |
| <------>/* a += b + m[s9, s11, s13, s15]; restore M_0 first only if word 0 is used */ |
| .if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0 |
| <------>vld1.8 {M_0}, [sp, :64] |
| .endif |
| <------>vadd.u64 d0, d0, d5 |
| <------>vadd.u64 d1, d1, d6 |
| <------>vadd.u64 d2, d2, d7 |
| <------>vadd.u64 d3, d3, d4 |
| <------>vadd.u64 d0, d0, M_\s9 |
| <------>vadd.u64 d1, d1, M_\s11 |
| <------>vadd.u64 d2, d2, M_\s13 |
| <------>vadd.u64 d3, d3, M_\s15 |
| |
| <------>/* d = ror64(d ^ a, 16) */ |
| <------>vld1.8 {M_0}, [ROR16_TABLE, :64] |
| <------>veor d15, d15, d0 |
| <------>veor d12, d12, d1 |
| <------>veor d13, d13, d2 |
| <------>veor d14, d14, d3 |
| <------>vtbl.8 d12, {d12}, M_0 |
| <------>vtbl.8 d13, {d13}, M_0 |
| <------>vtbl.8 d14, {d14}, M_0 |
| <------>vtbl.8 d15, {d15}, M_0 |
| |
| <------>/* c += d */ |
| <------>vadd.u64 d10, d10, d15 |
| <------>vadd.u64 d11, d11, d12 |
| <------>vadd.u64 d8, d8, d13 |
| <------>vadd.u64 d9, d9, d14 |
| |
| <------>/* b = ror64(b ^ c, 63); xor results go to d16-d19 in d4..d7 order so q8-q9 line up with q2-q3 */ |
| <------>veor d16, d4, d9 |
| <------>veor d17, d5, d10 |
| <------>veor d18, d6, d11 |
| <------>veor d19, d7, d8 |
| <------>vshr.u64 q2, q8, #63 |
| <------>vshr.u64 q3, q9, #63 |
| <------>vsli.u64 q2, q8, #1 |
| <------>vsli.u64 q3, q9, #1 |
| <------>/* Restore M_0-M_3 for the next round; not emitted when final is nonzero */ |
| .if ! \final |
| <------>vld1.8 {q8-q9}, [sp, :256] |
| .endif |
| .endm |
| |
| /* blake2b_compress_neon: run the BLAKE2b compression function over NBLOCKS */ |
| /* consecutive 128-byte message blocks starting at BLOCK, updating STATE. */ |
| /* STATE (r0): 64 bytes read and rewritten per block, called h[0..7] below, then */ |
| /*   a 128-bit counter t[0..1] and 16 bytes f[0..1] (names follow RFC 7693; the */ |
| /*   C structure is not visible here - confirm against the caller). */ |
| /* INC (r3) is added to t before each block.  NBLOCKS (r2) must be nonzero: the */ |
| /* loop count is only tested after a block has been processed. */ |
| /* Clobbers r0-r3, ip, flags and q0-q15; r4-r10 are saved and restored.  No NEON */ |
| /* register is saved - NOTE(review): presumably the caller owns the NEON state. */ |
| <------>.align 5 |
| ENTRY(blake2b_compress_neon) |
| <------>push {r4-r10} |
| |
| <------>/* Carve out a 32-byte scratch buffer below sp, rounded down to a 32-byte boundary */ |
| <------>mov ORIG_SP, sp |
| <------>sub ip, sp, #32 |
| <------>bic ip, ip, #31 |
| <------>mov sp, ip |
| |
| <------>adr ROR24_TABLE, .Lror24_table |
| <------>adr ROR16_TABLE, .Lror16_table |
| |
| <------>mov ip, STATE |
| <------>vld1.64 {q0-q1}, [ip]! /* h[0..3] */ |
| <------>vld1.64 {q2-q3}, [ip]! /* h[4..7]; ip is now STATE + 64 */ |
| .Lnext_block: |
| <------> adr r10, .Lblake2b_IV |
| <------>vld1.64 {q14-q15}, [ip] /* t[0..1] and f[0..1] */ |
| <------>vld1.64 {q4-q5}, [r10]! /* v[8..11] = IV[0..3] */ |
| <------> vmov r7, r8, d28 /* r7 = low word, r8 = high word of t[0] */ |
| <------>vld1.64 {q6-q7}, [r10] /* IV[4..7] */ |
| <------> adds r7, r7, INC /* add INC to the low word; carry out in C */ |
| <------>bcs .Lslow_inc_ctr /* carry out of the low 32 bits: take the full-width path */ |
| <------>vmov.i32 d28[0], r7 |
| <------>vst1.64 {d28}, [ip] /* no carry: only t[0] changed, store it back */ |
| .Linc_ctr_done: |
| |
| <------>/* Load the 128-byte message block into q8-q15 (M_0-M_15) and finish */ |
| <------>/* setting up the state matrix: q0-q3 hold h[0..7], q4-q5 hold IV[0..3], */ |
| <------>/* and q6-q7 become IV[4..7] xored with the updated t and with f. */ |
| <------>/* */ |
| <------>/* That fills all sixteen q registers, yet _blake2b_round needs scratch */ |
| <------>/* registers for its table lookups and shifts.  It borrows q8-q9, so a */ |
| <------>/* copy of the first 32 message bytes is stored in the aligned stack */ |
| <------>/* buffer and reloaded from there with aligned loads whenever needed. */ |
| <------>/* */ |
| <------>/* NOTE(review): lines with an extra leading space look like a second, */ |
| <------>/* independent instruction stream interleaved for scheduling - confirm. */ |
| <------>vld1.8 {q8-q9}, [BLOCK]! |
| <------> veor q6, q6, q14 /* v[12..13] = IV[4..5] ^ t[0..1] */ |
| <------>vld1.8 {q10-q11}, [BLOCK]! |
| <------> veor q7, q7, q15 /* v[14..15] = IV[6..7] ^ f[0..1] */ |
| <------>vld1.8 {q12-q13}, [BLOCK]! |
| <------>vst1.8 {q8-q9}, [sp, :256] |
| <------> mov ip, STATE /* rewind ip for the h[] reload after the rounds */ |
| <------>vld1.8 {q14-q15}, [BLOCK]! |
| |
| <------>/* Twelve rounds.  Each invocation lists the message-word order for that */ |
| <------>/* round; rounds 11 and 12 repeat the orders of rounds 1 and 2. */ |
| <------>_blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| <------>_blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 |
| <------>_blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 |
| <------>_blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 |
| <------>_blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 |
| <------>_blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 |
| <------>_blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 |
| <------>_blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 |
| <------>_blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 |
| <------>_blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 |
| <------>_blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| <------>_blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \ |
| <------><------><------>final=1 |
| |
| <------>/* Fold the final state matrix back into the chaining value: */ |
| <------>/* */ |
| <------>/*     for i in 0..7:  h[i] ^= v[i] ^ v[i + 8] */ |
| <------>/* */ |
| <------>/* The old h[] is reloaded into q8-q11, which are free after the last round. */ |
| <------> vld1.64 {q8-q9}, [ip]! |
| <------>veor q0, q0, q4 |
| <------>veor q1, q1, q5 |
| <------> vld1.64 {q10-q11}, [ip] |
| <------>veor q2, q2, q6 |
| <------>veor q3, q3, q7 |
| <------>veor q0, q0, q8 |
| <------>veor q1, q1, q9 |
| <------> mov ip, STATE |
| <------> subs NBLOCKS, NBLOCKS, #1 /* sets Z for the bne below */ |
| <------> vst1.64 {q0-q1}, [ip]! |
| <------>veor q2, q2, q10 |
| <------>veor q3, q3, q11 |
| <------> vst1.64 {q2-q3}, [ip]! /* ip ends at STATE + 64, as .Lnext_block expects */ |
| |
| <------>/* Loop while blocks remain; q0-q3 already hold the new h[] and BLOCK has advanced */ |
| <------>bne .Lnext_block |
| |
| <------>mov sp, ORIG_SP |
| <------>pop {r4-r10} |
| <------>mov pc, lr |
| |
| .Lslow_inc_ctr: |
| <------>/* The low 32 bits of t overflowed: propagate the carry (still in C, since */ |
| <------>/* vmov leaves the flags alone) through the other 96 bits and store all of t. */ |
| <------>vmov r9, r10, d29 |
| <------>adcs r8, r8, #0 |
| <------>adcs r9, r9, #0 |
| <------>adc r10, r10, #0 |
| <------>vmov d28, r7, r8 |
| <------>vmov d29, r9, r10 |
| <------>vst1.64 {q14}, [ip] |
| <------>b .Linc_ctr_done |
| ENDPROC(blake2b_compress_neon) |
| |