aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHarm Hanemaaijer <fgenfb@yahoo.com>2013-07-12 02:13:14 +0200
committerMister Oyster <oysterized@gmail.com>2017-04-11 10:57:09 +0200
commit3edd7a2c542d2489ad734023966721ab87e7109e (patch)
treeee2fbc63f5d2bb48ff135b8a1713ddfe81468cf2
parent34ca1f77d4c63af9cc7083e06427f1adaaf768b7 (diff)
Optimize copy_page for modern ARM platforms
The existing implementation of copy_page for ARM appears to be optimized for older platforms. Benchmark testing in a sandbox environment shows suboptimal performance on modern platforms like armv6 and armv7, with speed-ups ranging from 10% (Cortex A8) to 80% (armv6 used in Raspberry Pi) being achievable. This commit optimizes copy_page and introduces the new compile-time constant PREFETCH_DISTANCE, defined in cache.h, which when multiplied by L1_CACHE_BYTES is equal to the offset used for prefetches performed with the PLD instruction. For platforms where L1_CACHE_BYTES is 32 (armv5 and armv6), copy_page processes 32 bytes at a time while doing one prefetch per iteration, while for armv7 (with L1_CACHE_BYTES equal to 64), 64 bytes are processed at at time with one prefetch per iteration. When no preload instruction is available (platforms earlier than armv5), no preload instructions are generated and 32 bytes are processed at at time. To facilitate specifying instructions for architectures with no preload instruction, the NO_PLD macro is added to assembler.h, augmenting the PLD macro. Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com> Signed-off-by: RyTek <rytek1128@outlook.com>
-rw-r--r--arch/arm/include/asm/assembler.h2
-rw-r--r--arch/arm/include/asm/cache.h24
-rw-r--r--arch/arm/lib/copy_page.S76
3 files changed, 78 insertions, 24 deletions
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 4f5e87568..aa7c4eddc 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -58,8 +58,10 @@
*/
#if __LINUX_ARM_ARCH__ >= 5
#define PLD(code...) code
+#define NO_PLD(code...)
#else
#define PLD(code...)
+#define NO_PLD(code...) code
#endif
/*
diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
index 75fe66bc0..dd25faa1b 100644
--- a/arch/arm/include/asm/cache.h
+++ b/arch/arm/include/asm/cache.h
@@ -8,6 +8,30 @@
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
/*
+ * Set the prefetch distance in units of L1_CACHE_BYTES based on the
+ * cache line size. The prefetch distance is used by the memcpy,
+ * copy_from_user, copy_to_user versions that are optimized
+ * for ARM v6 and v7 platforms, as well as the copy_page function
+ * on ARM v5, v6 and v7 platforms.
+ */
+
+#if L1_CACHE_BYTES == 64
+/*
+ * This value was calibrated on a Cortex A8-based SOC with a 32-bit
+ * DDR3 interface. Other Cortex cores and architectures may benefit
+ * from a different setting.
+ */
+#define PREFETCH_DISTANCE 3
+#else
+/*
+ * This value was calibrated on the ARM v6-based SOC used in the Raspbery
+ * Pi. Other architectures may benefit from a different setting.
+ */
+#define PREFETCH_DISTANCE 3
+#endif
+
+
+/*
* Memory returned by kmalloc() may be used for DMA, so we must make
* sure that all such allocations are cache aligned. Otherwise,
* unrelated code may cause parts of the buffer to be read into the
diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S
index 6ee2f6706..119657d74 100644
--- a/arch/arm/lib/copy_page.S
+++ b/arch/arm/lib/copy_page.S
@@ -8,40 +8,68 @@
* published by the Free Software Foundation.
*
* ASM optimised string functions
+ *
+ * Optimization for modern ARM platforms
+ * Copyright 2013 Harm Hanemaaijer
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/asm-offsets.h>
#include <asm/cache.h>
-#define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 ))
-
- .text
- .align 5
/*
- * StrongARM optimised copy_page routine
- * now 1.78bytes/cycle, was 1.60 bytes/cycle (50MHz bus -> 89MB/s)
- * Note that we probably achieve closer to the 100MB/s target with
- * the core clock switching.
+ * Notes for armv6/v7:
+ * These architectures do not like paired preloads in a 64-byte loop.
+ * Instead use a 32-byte loop with one preload per loop on armv6
+ * (L1_CACHE_BYTES == 32). On armv7 (L1_CACHE_BYTES == 64), use a
+ * 64-byte loop with one preload per loop. In addition, make sure no
+ * prefetching happens beyond the source region. The prefetch distance,
+ * configured in cache.h, defaults to 3 (96 bytes on armv5/armv6, 192
+ * bytes on armv7). This function translates to 16-bit Thumb2
+ * instructions whenever possible.
+ *
+ * This version should work on older platforms as well and is unlikely
+ * to degrade performance significantly.
*/
+
+#define COPY_COUNT (PAGE_SZ / (L1_CACHE_BYTES))
+
+ .text
+ ARM( .p2align 5 )
+ THUMB( .p2align 2 )
+
ENTRY(copy_page)
- stmfd sp!, {r4, lr} @ 2
+ stmfd sp!, {r4-r8, lr}
PLD( pld [r1, #0] )
PLD( pld [r1, #L1_CACHE_BYTES] )
- mov r2, #COPY_COUNT @ 1
- ldmia r1!, {r3, r4, ip, lr} @ 4+1
-1: PLD( pld [r1, #2 * L1_CACHE_BYTES])
- PLD( pld [r1, #3 * L1_CACHE_BYTES])
+#if PREFETCH_DISTANCE > 2
+ PLD( pld [r1, #2 * L1_CACHE_BYTES] )
+#if PREFETCH_DISTANCE > 3
+ PLD( pld [r1, #3 * L1_CACHE_BYTES] )
+#if PREFETCH_DISTANCE > 4
+ PLD( pld [r1, #4 * L1_CACHE_BYTES] )
+#endif
+#endif
+#endif
+ PLD( movs r2, #(COPY_COUNT - PREFETCH_DISTANCE) )
+ NO_PLD( mov r2, #COPY_COUNT )
+1: PLD( pld [r1, #PREFETCH_DISTANCE * L1_CACHE_BYTES])
2:
- .rept (2 * L1_CACHE_BYTES / 16 - 1)
- stmia r0!, {r3, r4, ip, lr} @ 4
- ldmia r1!, {r3, r4, ip, lr} @ 4
- .endr
- subs r2, r2, #1 @ 1
- stmia r0!, {r3, r4, ip, lr} @ 4
- ldmgtia r1!, {r3, r4, ip, lr} @ 4
- bgt 1b @ 1
- PLD( ldmeqia r1!, {r3, r4, ip, lr} )
- PLD( beq 2b )
- ldmfd sp!, {r4, pc} @ 3
+#if L1_CACHE_BYTES == 32
+ ldmia r1!, {r3-r6}
+ ldmia r1!, {r7, r8, ip, lr}
+ stmia r0!, {r3-r6}
+ subs r2, r2, #1
+ stmia r0!, {r7, r8, ip, lr}
+#else /* L1_CACHE_BYTES == 64 */
+ ldmia r1!, {r3-r8, ip, lr}
+ stmia r0!, {r3-r8, ip, lr}
+ ldmia r1!, {r3-r8, ip, lr}
+ subs r2, r2, #1
+ stmia r0!, {r3-r8, ip, lr}
+#endif
+ bgt 1b
+ PLD( cmn r2, #PREFETCH_DISTANCE )
+ PLD( bgt 2b )
+ ldmfd sp!, {r4-r8, pc}
ENDPROC(copy_page)