From c2a78afc15de953f9aa4fa2cd186df0991424f46 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 27 Nov 2018 18:42:55 +0100
Subject: [PATCH] arm64/lib: improve CRC32 performance for deep pipelines

Improve the performance of the crc32() asm routines by getting rid of
most of the branches and small sized loads on the common path.

Instead, use a branchless code path involving overlapping 16 byte
loads to process the first (length % 32) bytes, and process the
remainder using a loop that processes 32 bytes at a time.

Tested using the following test program:

  #include <stdlib.h>

  extern void crc32_le(unsigned short, char const*, int);

  int main(void)
  {
    static const char buf[4096];

    srand(20181126);

    for (int i = 0; i < 100 * 1000 * 1000; i++)
      crc32_le(0, buf, rand() % 1024);

    return 0;
  }

On Cortex-A53 and Cortex-A57, the performance regresses but only very
slightly. On Cortex-A72 however, the performance improves from

  $ time ./crc32

  real  0m10.149s
  user  0m10.149s
  sys   0m0.000s

to

  $ time ./crc32

  real  0m7.915s
  user  0m7.915s
  sys   0m0.000s

Cc: Rui Sun <sunrui26@huawei.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/lib/crc32.S | 54 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/lib/crc32.S b/arch/arm64/lib/crc32.S
index 5bc1e85b4e1c..f132f2a7522e 100644
--- a/arch/arm64/lib/crc32.S
+++ b/arch/arm64/lib/crc32.S
@@ -15,15 +15,59 @@
 	.cpu		generic+crc
 
 	.macro		__crc32, c
-0:	subs		x2, x2, #16
-	b.mi		8f
-	ldp		x3, x4, [x1], #16
+	cmp		x2, #16
+	b.lt		8f			// less than 16 bytes
+
+	and		x7, x2, #0x1f
+	and		x2, x2, #~0x1f
+	cbz		x7, 32f			// multiple of 32 bytes
+
+	and		x8, x7, #0xf
+	ldp		x3, x4, [x1]
+	add		x8, x8, x1
+	add		x1, x1, x7
+	ldp		x5, x6, [x8]
 CPU_BE(	rev		x3, x3		)
 CPU_BE(	rev		x4, x4		)
+CPU_BE(	rev		x5, x5		)
+CPU_BE(	rev		x6, x6		)
+
+	tst		x7, #8
+	crc32\c\()x	w8, w0, x3
+	csel		x3, x3, x4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #4
+	lsr		x4, x3, #32
+	crc32\c\()w	w8, w0, w3
+	csel		x3, x3, x4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #2
+	lsr		w4, w3, #16
+	crc32\c\()h	w8, w0, w3
+	csel		w3, w3, w4, eq
+	csel		w0, w0, w8, eq
+	tst		x7, #1
+	crc32\c\()b	w8, w0, w3
+	csel		w0, w0, w8, eq
+	tst		x7, #16
+	crc32\c\()x	w8, w0, x5
+	crc32\c\()x	w8, w8, x6
+	csel		w0, w0, w8, eq
+	cbz		x2, 0f
+
+32:	ldp		x3, x4, [x1], #32
+	sub		x2, x2, #32
+	ldp		x5, x6, [x1, #-16]
+CPU_BE(	rev		x3, x3		)
+CPU_BE(	rev		x4, x4		)
+CPU_BE(	rev		x5, x5		)
+CPU_BE(	rev		x6, x6		)
 	crc32\c\()x	w0, w0, x3
 	crc32\c\()x	w0, w0, x4
-	b.ne		0b
-	ret
+	crc32\c\()x	w0, w0, x5
+	crc32\c\()x	w0, w0, x6
+	cbnz		x2, 32b
+0:	ret
 
 8:	tbz		x2, #3, 4f
 	ldr		x3, [x1], #8