galois_arm64.s 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. //+build !noasm,!appengine,!gccgo
  2. // Copyright 2015, Klaus Post, see LICENSE for details.
  3. // Copyright 2017, Minio, Inc.
  4. // Use github.com/minio/asm2plan9s on this file to assemble the ARM instructions given
  5. // in the trailing comments into the WORD opcodes that the Plan 9 assembler accepts.
  6. // polynomial multiplication: carry-less multiply of every input byte in v26/v27 by the matching byte of v28; the unreduced 16-bit products land in v0, v6, v12 and v18
  7. #define POLYNOMIAL_MULTIPLICATION \
  8. WORD $0x0e3ce340 \ // pmull v0.8h,v26.8b,v28.8b (v0 = products of the low 8 bytes of v26)
  9. WORD $0x4e3ce346 \ // pmull2 v6.8h,v26.16b,v28.16b (v6 = products of the high 8 bytes of v26)
  10. WORD $0x0e3ce36c \ // pmull v12.8h,v27.8b,v28.8b (v12 = products of the low 8 bytes of v27)
  11. WORD $0x4e3ce372 // pmull2 v18.8h,v27.16b,v28.16b (v18 = products of the high 8 bytes of v27)
  12. // first reduction: take the high byte of each 16-bit product, multiply it by the polynomial byte held in v30 and XOR that back into the product. Inputs/outputs: v0, v6, v12, v18 (updated in place); v2, v8, v14, v20 keep the extracted high bytes for SECOND_REDUCTION; v3, v9, v15, v21 are scratch.
  13. #define FIRST_REDUCTION \
  14. WORD $0x0f088402 \ // shrn v2.8b, v0.8h, #8 (v2 = high byte of each 16-bit lane of v0)
  15. WORD $0x0f0884c8 \ // shrn v8.8b, v6.8h, #8 (same for v6)
  16. WORD $0x0f08858e \ // shrn v14.8b, v12.8h, #8 (same for v12)
  17. WORD $0x0f088654 \ // shrn v20.8b, v18.8h, #8 (same for v18)
  18. WORD $0x0e22e3c3 \ // pmull v3.8h,v30.8b,v2.8b (v3 = high bytes * polynomial byte)
  19. WORD $0x0e28e3c9 \ // pmull v9.8h,v30.8b,v8.8b
  20. WORD $0x0e2ee3cf \ // pmull v15.8h,v30.8b,v14.8b
  21. WORD $0x0e34e3d5 \ // pmull v21.8h,v30.8b,v20.8b
  22. WORD $0x6e201c60 \ // eor v0.16b,v3.16b,v0.16b (fold the reduction term into the product)
  23. WORD $0x6e261d26 \ // eor v6.16b,v9.16b,v6.16b
  24. WORD $0x6e2c1dec \ // eor v12.16b,v15.16b,v12.16b
  25. WORD $0x6e321eb2 // eor v18.16b,v21.16b,v18.16b
  26. // second reduction: the first fold can itself spill bits above bit 7, so isolate those bits and fold them in once more. Reads v0, v6, v12, v18 and the saved high bytes v2, v8, v14, v20; leaves the four results in v0, v1, v2, v3 (only the low byte of each 16-bit lane is consumed afterwards by the tbl instructions).
  27. #define SECOND_REDUCTION \
  28. WORD $0x0f088404 \ // shrn v4.8b, v0.8h, #8 (v4 = high byte of each lane after the first reduction)
  29. WORD $0x0f0884ca \ // shrn v10.8b, v6.8h, #8
  30. WORD $0x0f088590 \ // shrn v16.8b, v12.8h, #8
  31. WORD $0x0f088656 \ // shrn v22.8b, v18.8h, #8
  32. WORD $0x6e241c44 \ // eor v4.16b,v2.16b,v4.16b (cancel the original high byte, leaving only the bits spilled by the first fold)
  33. WORD $0x6e2a1d0a \ // eor v10.16b,v8.16b,v10.16b
  34. WORD $0x6e301dd0 \ // eor v16.16b,v14.16b,v16.16b
  35. WORD $0x6e361e96 \ // eor v22.16b,v20.16b,v22.16b
  36. WORD $0x0e24e3c5 \ // pmull v5.8h,v30.8b,v4.8b (spilled bits * polynomial byte)
  37. WORD $0x0e2ae3cb \ // pmull v11.8h,v30.8b,v10.8b
  38. WORD $0x0e30e3d1 \ // pmull v17.8h,v30.8b,v16.8b
  39. WORD $0x0e36e3d7 \ // pmull v23.8h,v30.8b,v22.8b
  40. WORD $0x6e201ca0 \ // eor v0.16b,v5.16b,v0.16b (final fold; results are packed into v0..v3)
  41. WORD $0x6e261d61 \ // eor v1.16b,v11.16b,v6.16b
  42. WORD $0x6e2c1e22 \ // eor v2.16b,v17.16b,v12.16b
  43. WORD $0x6e321ee3 // eor v3.16b,v23.16b,v18.16b
  44. // func galMulNEON(c uint64, in, out []byte) -- for every full 32-byte chunk of in, multiplies each byte by the low byte of c (carry-less multiply followed by two reductions with the 0x1d polynomial byte) and stores the 32 result bytes to out. The trailing len(in)%32 bytes are not processed here and len(out) is never read; presumably the Go caller handles the tail and guarantees out is large enough -- TODO confirm against the caller.
  45. TEXT ·galMulNEON(SB), 7, $0 // flags 7 = NOPROF|DUPOK|NOSPLIT (per Go's textflag.h); $0 = no local stack frame
  46. MOVD c+0(FP), R0 // R0 = c
  47. MOVD in_base+8(FP), R1 // R1 = read pointer into in
  48. MOVD in_len+16(FP), R2 // length of message (bytes); used as the loop counter
  49. MOVD out_base+32(FP), R5 // R5 = write pointer into out
  50. SUBS $32, R2 // R2 = len(in) - 32; sets the flags tested below
  51. BMI complete // fewer than 32 bytes: return without touching out
  52. // Load constants table pointer
  53. MOVD $·constants(SB), R3
  54. // and load constants into v30 & v31 (v30 = 0x1d in every byte, v31 = byte indices for tbl)
  55. WORD $0x4c40a07e // ld1 {v30.16b-v31.16b}, [x3]
  56. WORD $0x4e010c1c // dup v28.16b, w0 (broadcast the low byte of c to all 16 lanes)
  57. loop:
  58. // Main loop: each iteration consumes 32 input bytes and produces 32 output bytes
  59. WORD $0x4cdfa83a // ld1 {v26.4s-v27.4s}, [x1], #32 (load 32 input bytes, advance x1)
  60. POLYNOMIAL_MULTIPLICATION // v0, v6, v12, v18 = 16-bit products
  61. FIRST_REDUCTION
  62. SECOND_REDUCTION // results now in the low byte of each lane of v0..v3
  63. // combine results: v31 selects the even-numbered bytes (low byte of each 16-bit lane) of each register pair
  64. WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b (output bytes 0-15)
  65. WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b (output bytes 16-31)
  66. // Store result
  67. WORD $0x4c9faca0 // st1 {v0.2d-v1.2d}, [x5], #32 (store 32 bytes, advance x5)
  68. SUBS $32, R2 // one chunk done
  69. BPL loop // R2 >= 0 means another full 32-byte chunk remains
  70. complete:
  71. RET
  72. // func galMulXorNEON(c uint64, in, out []byte) -- same multiply as galMulNEON, but each 32-byte result is XORed with the bytes already in out before being stored back (out ^= c*in). Only full 32-byte chunks of in are processed and len(out) is never read; presumably the Go caller handles the tail and guarantees out is large enough -- TODO confirm against the caller.
  73. TEXT ·galMulXorNEON(SB), 7, $0 // flags 7 = NOPROF|DUPOK|NOSPLIT (per Go's textflag.h); $0 = no local stack frame
  74. MOVD c+0(FP), R0 // R0 = c
  75. MOVD in_base+8(FP), R1 // R1 = read pointer into in
  76. MOVD in_len+16(FP), R2 // length of message (bytes); used as the loop counter
  77. MOVD out_base+32(FP), R5 // R5 = read/write pointer into out
  78. SUBS $32, R2 // R2 = len(in) - 32; sets the flags tested below
  79. BMI completeXor // fewer than 32 bytes: return without touching out
  80. // Load constants table pointer
  81. MOVD $·constants(SB), R3
  82. // and load constants into v30 & v31 (v30 = 0x1d in every byte, v31 = byte indices for tbl)
  83. WORD $0x4c40a07e // ld1 {v30.16b-v31.16b}, [x3]
  84. WORD $0x4e010c1c // dup v28.16b, w0 (broadcast the low byte of c to all 16 lanes)
  85. loopXor:
  86. // Main loop: each iteration consumes 32 input bytes and updates 32 output bytes
  87. WORD $0x4cdfa83a // ld1 {v26.4s-v27.4s}, [x1], #32 (load 32 input bytes, advance x1)
  88. WORD $0x4c40a8b8 // ld1 {v24.4s-v25.4s}, [x5] (load the current 32 output bytes; x5 is not advanced yet)
  89. POLYNOMIAL_MULTIPLICATION // v0, v6, v12, v18 = 16-bit products
  90. FIRST_REDUCTION
  91. SECOND_REDUCTION // results now in the low byte of each lane of v0..v3
  92. // combine results: v31 selects the even-numbered bytes (low byte of each 16-bit lane) of each register pair
  93. WORD $0x4e1f2000 // tbl v0.16b,{v0.16b,v1.16b},v31.16b (product bytes 0-15)
  94. WORD $0x4e1f2041 // tbl v1.16b,{v2.16b,v3.16b},v31.16b (product bytes 16-31)
  95. // Xor result and store
  96. WORD $0x6e381c00 // eor v0.16b,v0.16b,v24.16b
  97. WORD $0x6e391c21 // eor v1.16b,v1.16b,v25.16b
  98. WORD $0x4c9faca0 // st1 {v0.2d-v1.2d}, [x5], #32 (store 32 bytes, advance x5)
  99. SUBS $32, R2 // one chunk done
  100. BPL loopXor // R2 >= 0 means another full 32-byte chunk remains
  101. completeXor:
  102. RET
  103. // Constants table: 32 bytes, loaded as v30 (first 16 bytes) and v31 (last 16 bytes) by both functions above
  104. // generating polynomial is 29 (= 0x1d); the reduction macros multiply overflow bits by this byte and XOR them back, i.e. they reduce modulo x^8 + 0x1d
  105. DATA ·constants+0x0(SB)/8, $0x1d1d1d1d1d1d1d1d
  106. DATA ·constants+0x8(SB)/8, $0x1d1d1d1d1d1d1d1d
  107. // constant for TBL instruction: byte indices 0,2,4,...,30 (stored little-endian), which pick the low byte of each 16-bit lane from a two-register table
  108. DATA ·constants+0x10(SB)/8, $0x0e0c0a0806040200
  109. DATA ·constants+0x18(SB)/8, $0x1e1c1a1816141210
  110. GLOBL ·constants(SB), 8, $32 // flag 8 = RODATA (per Go's textflag.h); total size 32 bytes