galoisAvx512_amd64.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. //+build !noasm
  2. //+build !appengine
  3. //+build !gccgo
  4. // Copyright 2015, Klaus Post, see LICENSE for details.
  5. // Copyright 2019, Minio, Inc.
  6. package reedsolomon
  7. //go:noescape
  8. func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
  9. //go:noescape
  10. func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
  11. func init() {
  12. amd64 = true
  13. }
  14. const (
  15. dimIn = 8 // Number of input rows processed simultaneously
  16. dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine
  17. dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine
  18. matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
  19. matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
  20. )
  21. // Construct block of matrix coefficients for 2 outputs rows in parallel
  22. func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
  23. offset := 0
  24. for c := inputOffset; c < inputOffset+dimIn; c++ {
  25. for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
  26. if c < len(matrixRows[iRow]) {
  27. coeff := matrixRows[iRow][c]
  28. copy(matrix[offset*32:], mulTableLow[coeff][:])
  29. copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
  30. } else {
  31. // coefficients not used for this input shard (so null out)
  32. v := matrix[offset*32 : offset*32+32]
  33. for i := range v {
  34. v[i] = 0
  35. }
  36. }
  37. offset += dimIn
  38. if offset >= dimIn*dimOut82 {
  39. offset -= dimIn*dimOut82 - 1
  40. }
  41. }
  42. }
  43. }
  44. // Construct block of matrix coefficients for 4 outputs rows in parallel
  45. func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
  46. offset := 0
  47. for c := inputOffset; c < inputOffset+dimIn; c++ {
  48. for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
  49. if c < len(matrixRows[iRow]) {
  50. coeff := matrixRows[iRow][c]
  51. copy(matrix[offset*32:], mulTableLow[coeff][:])
  52. copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
  53. } else {
  54. // coefficients not used for this input shard (so null out)
  55. v := matrix[offset*32 : offset*32+32]
  56. for i := range v {
  57. v[i] = 0
  58. }
  59. }
  60. offset += dimIn
  61. if offset >= dimIn*dimOut84 {
  62. offset -= dimIn*dimOut84 - 1
  63. }
  64. }
  65. }
  66. }
  67. // Invoke AVX512 routine for 2 output rows in parallel
  68. func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
  69. done := len(in[0])
  70. if done == 0 {
  71. return
  72. }
  73. inputEnd := inputOffset + dimIn
  74. if inputEnd > len(in) {
  75. inputEnd = len(in)
  76. }
  77. outputEnd := outputOffset + dimOut82
  78. if outputEnd > len(out) {
  79. outputEnd = len(out)
  80. }
  81. matrix82 := [matrixSize82]byte{}
  82. setupMatrix82(matrixRows, inputOffset, outputOffset, &matrix82)
  83. addTo := inputOffset != 0 // Except for the first input column, add to previous results
  84. _galMulAVX512Parallel82(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix82, addTo)
  85. done = (done >> 6) << 6
  86. if len(in[0])-done == 0 {
  87. return
  88. }
  89. for c := inputOffset; c < inputOffset+dimIn; c++ {
  90. for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
  91. if c < len(matrixRows[iRow]) {
  92. mt := mulTable[matrixRows[iRow][c]][:256]
  93. for i := done; i < len(in[0]); i++ {
  94. if c == 0 { // only set value for first input column
  95. out[iRow][i] = mt[in[c][i]]
  96. } else { // and add for all others
  97. out[iRow][i] ^= mt[in[c][i]]
  98. }
  99. }
  100. }
  101. }
  102. }
  103. }
  104. // Invoke AVX512 routine for 4 output rows in parallel
  105. func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
  106. done := len(in[0])
  107. if done == 0 {
  108. return
  109. }
  110. inputEnd := inputOffset + dimIn
  111. if inputEnd > len(in) {
  112. inputEnd = len(in)
  113. }
  114. outputEnd := outputOffset + dimOut84
  115. if outputEnd > len(out) {
  116. outputEnd = len(out)
  117. }
  118. matrix84 := [matrixSize84]byte{}
  119. setupMatrix84(matrixRows, inputOffset, outputOffset, &matrix84)
  120. addTo := inputOffset != 0 // Except for the first input column, add to previous results
  121. _galMulAVX512Parallel84(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix84, addTo)
  122. done = (done >> 6) << 6
  123. if len(in[0])-done == 0 {
  124. return
  125. }
  126. for c := inputOffset; c < inputOffset+dimIn; c++ {
  127. for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
  128. if c < len(matrixRows[iRow]) {
  129. mt := mulTable[matrixRows[iRow][c]][:256]
  130. for i := done; i < len(in[0]); i++ {
  131. if c == 0 { // only set value for first input column
  132. out[iRow][i] = mt[in[c][i]]
  133. } else { // and add for all others
  134. out[iRow][i] ^= mt[in[c][i]]
  135. }
  136. }
  137. }
  138. }
  139. }
  140. }
  141. // Perform the same as codeSomeShards, but taking advantage of
  142. // AVX512 parallelism for up to 4x faster execution as compared to AVX2
  143. func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
  144. outputRow := 0
  145. // First process (multiple) batches of 4 output rows in parallel
  146. for ; outputRow+dimOut84 <= len(outputs); outputRow += dimOut84 {
  147. for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
  148. galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow)
  149. }
  150. }
  151. // Then process a (single) batch of 2 output rows in parallel
  152. if outputRow+dimOut82 <= len(outputs) {
  153. // fmt.Println(outputRow, len(outputs))
  154. for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
  155. galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow)
  156. }
  157. outputRow += dimOut82
  158. }
  159. // Lastly, we may have a single output row left (for uneven parity)
  160. if outputRow < len(outputs) {
  161. for c := 0; c < r.DataShards; c++ {
  162. if c == 0 {
  163. galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
  164. } else {
  165. galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
  166. }
  167. }
  168. }
  169. }