Optimize copy_page for modern ARM platforms

The existing implementation of copy_page for ARM appears to be optimized for older platforms. Benchmark testing in a sandbox environment shows suboptimal performance on modern platforms like armv6 and armv7, with speed-ups ranging from 10% (Cortex A8) to 80% (armv6 used in Raspberry Pi) being achievable. This commit optimizes copy_page and introduces the new compile-time constant PREFETCH_DISTANCE, defined in cache.h, which when multiplied by L1_CACHE_BYTES is equal to the offset used for prefetches performed with the PLD instruction. For platforms where L1_CACHE_BYTES is 32 (armv5 and armv6), copy_page processes 32 bytes at a time while doing one prefetch per iteration, while for armv7 (with L1_CACHE_BYTES equal to 64), 64 bytes are processed at a time with one prefetch per iteration. When no preload instruction is available (platforms earlier than armv5), no preload instructions are generated and 32 bytes are processed at a time. To facilitate specifying instructions for architectures with no preload instruction, the NO_PLD macro is added to assembler.h, augmenting the PLD macro. Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com> Signed-off-by: RyTek <rytek1128@outlook.com>
author: Harm Hanemaaijer <fgenfb@yahoo.com> 2013-07-12 02:13:14 +0200
committer: Mister Oyster <oysterized@gmail.com> 2017-04-11 10:57:09 +0200
commit: 3edd7a2c542d2489ad734023966721ab87e7109e (patch)
tree: ee2fbc63f5d2bb48ff135b8a1713ddfe81468cf2
parent: 34ca1f77d4c63af9cc7083e06427f1adaaf768b7 (diff)
3 files changed, 78 insertions, 24 deletions
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 4f5e87568..aa7c4eddc 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -58,8 +58,10 @@
  */
 #if __LINUX_ARM_ARCH__ >= 5
 #define PLD(code...)	code
+#define NO_PLD(code...)
 #else
 #define PLD(code...)
+#define NO_PLD(code...) code
 #endif
 
 /*
diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
index 75fe66bc0..dd25faa1b 100644
--- a/arch/arm/include/asm/cache.h
+++ b/arch/arm/include/asm/cache.h
@@ -8,6 +8,30 @@
 #define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
 
 /*
+ * Set the prefetch distance in units of L1_CACHE_BYTES based on the
+ * cache line size. The prefetch distance is used by the memcpy,
+ * copy_from_user, copy_to_user versions that are optimized
+ * for ARM v6 and v7 platforms, as well as the copy_page function
+ * on ARM v5, v6 and v7 platforms.
+ */
+
+#if L1_CACHE_BYTES == 64
+/*
+ * This value was calibrated on a Cortex A8-based SOC with a 32-bit
+ * DDR3 interface. Other Cortex cores and architectures may benefit
+ * from a different setting.
+ */
+#define PREFETCH_DISTANCE 3
+#else
+/*
+ * This value was calibrated on the ARM v6-based SOC used in the Raspberry
+ * Pi. Other architectures may benefit from a different setting.
+ */
+#define PREFETCH_DISTANCE 3
+#endif
+
+
+/*
  * Memory returned by kmalloc() may be used for DMA, so we must make
  * sure that all such allocations are cache aligned. Otherwise,
  * unrelated code may cause parts of the buffer to be read into the
diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S
index 6ee2f6706..119657d74 100644
--- a/arch/arm/lib/copy_page.S
+++ b/arch/arm/lib/copy_page.S
@@ -8,40 +8,68 @@
  * published by the Free Software Foundation.
  *
  *  ASM optimised string functions
+ *
+ *  Optimization for modern ARM platforms
+ *  Copyright 2013 Harm Hanemaaijer
  */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 #include <asm/asm-offsets.h>
 #include <asm/cache.h>
 
-#define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 ))
-
-		.text
-		.align	5
 /*
- * StrongARM optimised copy_page routine
- * now 1.78bytes/cycle, was 1.60 bytes/cycle (50MHz bus -> 89MB/s)
- * Note that we probably achieve closer to the 100MB/s target with
- * the core clock switching.
+ * Notes for armv6/v7:
+ * These architectures do not like paired preloads in a 64-byte loop.
+ * Instead use a 32-byte loop with one preload per loop on armv6
+ * (L1_CACHE_BYTES == 32). On armv7 (L1_CACHE_BYTES == 64), use a
+ * 64-byte loop with one preload per loop. In addition, make sure no
+ * prefetching happens beyond the source region. The prefetch distance,
+ * configured in cache.h, defaults to 3 (96 bytes on armv5/armv6, 192
+ * bytes on armv7). This function translates to 16-bit Thumb2
+ * instructions whenever possible.
+ *
+ * This version should work on older platforms as well and is unlikely
+ * to degrade performance significantly.
  */
+
+#define COPY_COUNT (PAGE_SZ / (L1_CACHE_BYTES))
+
+		.text
+	ARM(	.p2align 5	)
+	THUMB(	.p2align 2	)
+
 ENTRY(copy_page)
-		stmfd	sp!, {r4, lr}			@	2
+		stmfd	sp!, {r4-r8, lr}
 	PLD(	pld	[r1, #0]		)
 	PLD(	pld	[r1, #L1_CACHE_BYTES]		)
-		mov	r2, #COPY_COUNT			@	1
-		ldmia	r1!, {r3, r4, ip, lr}		@	4+1
-1:	PLD(	pld	[r1, #2 * L1_CACHE_BYTES])
-	PLD(	pld	[r1, #3 * L1_CACHE_BYTES])
+#if PREFETCH_DISTANCE > 2
+	PLD(	pld	[r1, #2 * L1_CACHE_BYTES]	)
+#if PREFETCH_DISTANCE > 3
+	PLD(	pld	[r1, #3 * L1_CACHE_BYTES]	)
+#if PREFETCH_DISTANCE > 4
+	PLD(	pld	[r1, #4 * L1_CACHE_BYTES]	)
+#endif
+#endif
+#endif
+	PLD(	movs	r2, #(COPY_COUNT - PREFETCH_DISTANCE)	)
+	NO_PLD(	mov	r2, #COPY_COUNT				)
+1:	PLD(	pld	[r1, #PREFETCH_DISTANCE * L1_CACHE_BYTES])
 2:
-	.rept	(2 * L1_CACHE_BYTES / 16 - 1)
-		stmia	r0!, {r3, r4, ip, lr}		@	4
-		ldmia	r1!, {r3, r4, ip, lr}		@	4
-	.endr
-		subs	r2, r2, #1			@	1
-		stmia	r0!, {r3, r4, ip, lr}		@	4
-		ldmgtia	r1!, {r3, r4, ip, lr}		@	4
-		bgt	1b				@	1
-	PLD(	ldmeqia r1!, {r3, r4, ip, lr}	)
-	PLD(	beq	2b			)
-		ldmfd	sp!, {r4, pc}			@	3
+#if L1_CACHE_BYTES == 32
+		ldmia	r1!, {r3-r6}
+		ldmia   r1!, {r7, r8, ip, lr}
+		stmia	r0!, {r3-r6}
+		subs	r2, r2, #1
+		stmia   r0!, {r7, r8, ip, lr}
+#else /* L1_CACHE_BYTES == 64 */
+		ldmia   r1!, {r3-r8, ip, lr}
+		stmia	r0!, {r3-r8, ip, lr}
+		ldmia   r1!, {r3-r8, ip, lr}
+		subs	r2, r2, #1
+		stmia	r0!, {r3-r8, ip, lr}
+#endif
+		bgt	1b
+	PLD(	cmn	r2, #PREFETCH_DISTANCE	)
+	PLD(	bgt	2b			)
+		ldmfd	sp!, {r4-r8, pc}
 ENDPROC(copy_page)
author	Harm Hanemaaijer <fgenfb@yahoo.com>	2013-07-12 02:13:14 +0200
committer	Mister Oyster <oysterized@gmail.com>	2017-04-11 10:57:09 +0200
commit	3edd7a2c542d2489ad734023966721ab87e7109e (patch)
tree	ee2fbc63f5d2bb48ff135b8a1713ddfe81468cf2
parent	34ca1f77d4c63af9cc7083e06427f1adaaf768b7 (diff)