diff options
| author | Harm Hanemaaijer <fgenfb@yahoo.com> | 2013-07-12 02:13:14 +0200 |
|---|---|---|
| committer | Mister Oyster <oysterized@gmail.com> | 2017-04-11 10:57:09 +0200 |
| commit | 3edd7a2c542d2489ad734023966721ab87e7109e (patch) | |
| tree | ee2fbc63f5d2bb48ff135b8a1713ddfe81468cf2 | |
| parent | 34ca1f77d4c63af9cc7083e06427f1adaaf768b7 (diff) | |
Optimize copy_page for modern ARM platforms
The existing implementation of copy_page for ARM appears to be
optimized for older platforms. Benchmark testing in a sandbox
environment shows suboptimal performance on modern platforms
like armv6 and armv7, with speed-ups ranging from 10% (Cortex A8)
to 80% (armv6 used in Raspberry Pi) being achievable.
This commit optimizes copy_page and introduces the new compile-time
constant PREFETCH_DISTANCE, defined in cache.h, which when
multiplied by L1_CACHE_BYTES is equal to the offset used for
prefetches performed with the PLD instruction. For platforms where
L1_CACHE_BYTES is 32 (armv5 and armv6), copy_page processes 32 bytes
at a time while doing one prefetch per iteration, while for armv7
(with L1_CACHE_BYTES equal to 64), 64 bytes are processed at at time
with one prefetch per iteration. When no preload instruction is
available (platforms earlier than armv5), no preload instructions
are generated and 32 bytes are processed at at time.
To facilitate specifying instructions for architectures with no
preload instruction, the NO_PLD macro is added to assembler.h,
augmenting the PLD macro.
Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com>
Signed-off-by: RyTek <rytek1128@outlook.com>
| -rw-r--r-- | arch/arm/include/asm/assembler.h | 2 | ||||
| -rw-r--r-- | arch/arm/include/asm/cache.h | 24 | ||||
| -rw-r--r-- | arch/arm/lib/copy_page.S | 76 |
3 files changed, 78 insertions, 24 deletions
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 4f5e87568..aa7c4eddc 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -58,8 +58,10 @@ */ #if __LINUX_ARM_ARCH__ >= 5 #define PLD(code...) code +#define NO_PLD(code...) #else #define PLD(code...) +#define NO_PLD(code...) code #endif /* diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h index 75fe66bc0..dd25faa1b 100644 --- a/arch/arm/include/asm/cache.h +++ b/arch/arm/include/asm/cache.h @@ -8,6 +8,30 @@ #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) /* + * Set the prefetch distance in units of L1_CACHE_BYTES based on the + * cache line size. The prefetch distance is used by the memcpy, + * copy_from_user, copy_to_user versions that are optimized + * for ARM v6 and v7 platforms, as well as the copy_page function + * on ARM v5, v6 and v7 platforms. + */ + +#if L1_CACHE_BYTES == 64 +/* + * This value was calibrated on a Cortex A8-based SOC with a 32-bit + * DDR3 interface. Other Cortex cores and architectures may benefit + * from a different setting. + */ +#define PREFETCH_DISTANCE 3 +#else +/* + * This value was calibrated on the ARM v6-based SOC used in the Raspbery + * Pi. Other architectures may benefit from a different setting. + */ +#define PREFETCH_DISTANCE 3 +#endif + + +/* * Memory returned by kmalloc() may be used for DMA, so we must make * sure that all such allocations are cache aligned. Otherwise, * unrelated code may cause parts of the buffer to be read into the diff --git a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S index 6ee2f6706..119657d74 100644 --- a/arch/arm/lib/copy_page.S +++ b/arch/arm/lib/copy_page.S @@ -8,40 +8,68 @@ * published by the Free Software Foundation. * * ASM optimised string functions + * + * Optimization for modern ARM platforms + * Copyright 2013 Harm Hanemaaijer */ #include <linux/linkage.h> #include <asm/assembler.h> #include <asm/asm-offsets.h> #include <asm/cache.h> -#define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 )) - - .text - .align 5 /* - * StrongARM optimised copy_page routine - * now 1.78bytes/cycle, was 1.60 bytes/cycle (50MHz bus -> 89MB/s) - * Note that we probably achieve closer to the 100MB/s target with - * the core clock switching. + * Notes for armv6/v7: + * These architectures do not like paired preloads in a 64-byte loop. + * Instead use a 32-byte loop with one preload per loop on armv6 + * (L1_CACHE_BYTES == 32). On armv7 (L1_CACHE_BYTES == 64), use a + * 64-byte loop with one preload per loop. In addition, make sure no + * prefetching happens beyond the source region. The prefetch distance, + * configured in cache.h, defaults to 3 (96 bytes on armv5/armv6, 192 + * bytes on armv7). This function translates to 16-bit Thumb2 + * instructions whenever possible. + * + * This version should work on older platforms as well and is unlikely + * to degrade performance significantly. */ + +#define COPY_COUNT (PAGE_SZ / (L1_CACHE_BYTES)) + + .text + ARM( .p2align 5 ) + THUMB( .p2align 2 ) + ENTRY(copy_page) - stmfd sp!, {r4, lr} @ 2 + stmfd sp!, {r4-r8, lr} PLD( pld [r1, #0] ) PLD( pld [r1, #L1_CACHE_BYTES] ) - mov r2, #COPY_COUNT @ 1 - ldmia r1!, {r3, r4, ip, lr} @ 4+1 -1: PLD( pld [r1, #2 * L1_CACHE_BYTES]) - PLD( pld [r1, #3 * L1_CACHE_BYTES]) +#if PREFETCH_DISTANCE > 2 + PLD( pld [r1, #2 * L1_CACHE_BYTES] ) +#if PREFETCH_DISTANCE > 3 + PLD( pld [r1, #3 * L1_CACHE_BYTES] ) +#if PREFETCH_DISTANCE > 4 + PLD( pld [r1, #4 * L1_CACHE_BYTES] ) +#endif +#endif +#endif + PLD( movs r2, #(COPY_COUNT - PREFETCH_DISTANCE) ) + NO_PLD( mov r2, #COPY_COUNT ) +1: PLD( pld [r1, #PREFETCH_DISTANCE * L1_CACHE_BYTES]) 2: - .rept (2 * L1_CACHE_BYTES / 16 - 1) - stmia r0!, {r3, r4, ip, lr} @ 4 - ldmia r1!, {r3, r4, ip, lr} @ 4 - .endr - subs r2, r2, #1 @ 1 - stmia r0!, {r3, r4, ip, lr} @ 4 - ldmgtia r1!, {r3, r4, ip, lr} @ 4 - bgt 1b @ 1 - PLD( ldmeqia r1!, {r3, r4, ip, lr} ) - PLD( beq 2b ) - ldmfd sp!, {r4, pc} @ 3 +#if L1_CACHE_BYTES == 32 + ldmia r1!, {r3-r6} + ldmia r1!, {r7, r8, ip, lr} + stmia r0!, {r3-r6} + subs r2, r2, #1 + stmia r0!, {r7, r8, ip, lr} +#else /* L1_CACHE_BYTES == 64 */ + ldmia r1!, {r3-r8, ip, lr} + stmia r0!, {r3-r8, ip, lr} + ldmia r1!, {r3-r8, ip, lr} + subs r2, r2, #1 + stmia r0!, {r3-r8, ip, lr} +#endif + bgt 1b + PLD( cmn r2, #PREFETCH_DISTANCE ) + PLD( bgt 2b ) + ldmfd sp!, {r4-r8, pc} ENDPROC(copy_page) |
