diff options
author | Artem Pyanykh <artem.pyanykh@gmail.com> | 2019-04-04 10:43:38 (GMT) |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2019-04-09 14:30:13 (GMT) |
commit | af4cea7f1411e5b99e2417d7c2d3d0e697093103 (patch) | |
tree | ec9ef85347e5c8915e864573997c15aaa8cc5a73 | |
parent | 36d380475d9056fdf93305985be3def00aaf6cf7 (diff) | |
download | ghc-af4cea7f1411e5b99e2417d7c2d3d0e697093103.zip ghc-af4cea7f1411e5b99e2417d7c2d3d0e697093103.tar.gz ghc-af4cea7f1411e5b99e2417d7c2d3d0e697093103.tar.bz2 |
codegen: fix memset unroll for small bytearrays, add 64-bit sets
Fixes #16052
When the offset in `setByteArray#` is statically known, we can provide
better alignment guarantees then just 1 byte.
Also, memset can now do 64-bit wide sets.
The current memset intrinsic is not optimal however and can be
improved for the case when we know that we deal with
(baseAddress at known alignment) + offset
For instance, on 64-bit
`setByteArray# s 1# 23# 0#`
given that bytearray is 8 bytes aligned could be unrolled into
`movb, movw, movl, movq, movq`; but currently it is
`movb x23` since alignment of 1 is all we can embed into MO_Memset op.
-rw-r--r-- | compiler/codeGen/StgCmmPrim.hs | 16 | ||||
-rw-r--r-- | compiler/nativeGen/X86/CodeGen.hs | 80 | ||||
-rw-r--r-- | compiler/utils/Util.hs | 10 | ||||
-rw-r--r-- | testsuite/driver/testlib.py | 59 | ||||
-rw-r--r-- | testsuite/tests/codeGen/should_gen_asm/all.T | 9 | ||||
-rw-r--r-- | testsuite/tests/codeGen/should_gen_asm/bytearray-memset-unroll.asm | 6 | ||||
-rw-r--r-- | testsuite/tests/codeGen/should_gen_asm/bytearray-memset-unroll.hs | 17 |
7 files changed, 162 insertions, 35 deletions
diff --git a/compiler/codeGen/StgCmmPrim.hs b/compiler/codeGen/StgCmmPrim.hs index 4a07c78..1abef3a 100644 --- a/compiler/codeGen/StgCmmPrim.hs +++ b/compiler/codeGen/StgCmmPrim.hs @@ -2073,10 +2073,18 @@ doCopyAddrToByteArrayOp src_p dst dst_off bytes = do -- character. doSetByteArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> CmmExpr -> FCode () -doSetByteArrayOp ba off len c - = do dflags <- getDynFlags - p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off - emitMemsetCall p c len 1 +doSetByteArrayOp ba off len c = do + dflags <- getDynFlags + let maxAlign = wORD_SIZE dflags + align = minimum [maxAlign, possibleAlign] + + p <- assignTempE $ cmmOffsetExpr dflags (cmmOffsetB dflags ba (arrWordsHdrSize dflags)) off + + emitMemsetCall p c len align + where + possibleAlign = case off of + CmmLit (CmmInt intOff _) -> fromIntegral $ byteAlignment (fromIntegral intOff) + _ -> 1 -- ---------------------------------------------------------------------------- -- Allocating arrays diff --git a/compiler/nativeGen/X86/CodeGen.hs b/compiler/nativeGen/X86/CodeGen.hs index 0424b1b..06ebd2a 100644 --- a/compiler/nativeGen/X86/CodeGen.hs +++ b/compiler/nativeGen/X86/CodeGen.hs @@ -1843,22 +1843,32 @@ genCCall dflags is32Bit (PrimTarget (MO_Memcpy align)) _ dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - i)) -genCCall dflags _ (PrimTarget (MO_Memset align)) _ +genCCall dflags is32Bit (PrimTarget (MO_Memset align)) _ [dst, CmmLit (CmmInt c _), CmmLit (CmmInt n _)] _ - | fromInteger insns <= maxInlineMemsetInsns dflags && align .&. 3 == 0 = do + | fromInteger insns <= maxInlineMemsetInsns dflags = do code_dst <- getAnyReg dst dst_r <- getNewRegNat format - return $ code_dst dst_r `appOL` go dst_r (fromInteger n) + if format == II64 && n >= 8 then do + code_imm8byte <- getAnyReg (CmmLit (CmmInt c8 W64)) + imm8byte_r <- getNewRegNat II64 + return $ code_dst dst_r `appOL` + code_imm8byte imm8byte_r `appOL` + go8 dst_r imm8byte_r (fromInteger n) + else + return $ code_dst dst_r `appOL` + go4 dst_r (fromInteger n) where - (format, val) = case align .&. 3 of - 2 -> (II16, c2) - 0 -> (II32, c4) - _ -> (II8, c) + format = case byteAlignment (fromIntegral align) of + 8 -> if is32Bit then II32 else II64 + 4 -> II32 + 2 -> II16 + _ -> II8 c2 = c `shiftL` 8 .|. c c4 = c2 `shiftL` 16 .|. c2 + c8 = c4 `shiftL` 32 .|. c4 -- The number of instructions we will generate (approx). We need 1 -- instructions per move. @@ -1868,25 +1878,45 @@ genCCall dflags _ (PrimTarget (MO_Memset align)) _ sizeBytes :: Integer sizeBytes = fromIntegral (formatInBytes format) - go :: Reg -> Integer -> OrdList Instr - go dst i - -- TODO: Add movabs instruction and support 64-bit sets. - | i >= sizeBytes = -- This might be smaller than the below sizes - unitOL (MOV format (OpImm (ImmInteger val)) (OpAddr dst_addr)) `appOL` - go dst (i - sizeBytes) - | i >= 4 = -- Will never happen on 32-bit - unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr dst_addr)) `appOL` - go dst (i - 4) - | i >= 2 = - unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr dst_addr)) `appOL` - go dst (i - 2) - | i >= 1 = - unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr dst_addr)) `appOL` - go dst (i - 1) - | otherwise = nilOL + -- Depending on size returns the widest MOV instruction and its + -- width. + gen4 :: AddrMode -> Integer -> (InstrBlock, Integer) + gen4 addr size + | size >= 4 = + (unitOL (MOV II32 (OpImm (ImmInteger c4)) (OpAddr addr)), 4) + | size >= 2 = + (unitOL (MOV II16 (OpImm (ImmInteger c2)) (OpAddr addr)), 2) + | size >= 1 = + (unitOL (MOV II8 (OpImm (ImmInteger c)) (OpAddr addr)), 1) + | otherwise = (nilOL, 0) + + -- Generates a 64-bit wide MOV instruction from REG to MEM. + gen8 :: AddrMode -> Reg -> InstrBlock + gen8 addr reg8byte = + unitOL (MOV format (OpReg reg8byte) (OpAddr addr)) + + -- Unrolls memset when the widest MOV is <= 4 bytes. + go4 :: Reg -> Integer -> InstrBlock + go4 dst left = + if left <= 0 then nilOL + else curMov `appOL` go4 dst (left - curWidth) where - dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone - (ImmInteger (n - i)) + possibleWidth = minimum [left, sizeBytes] + dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left)) + (curMov, curWidth) = gen4 dst_addr possibleWidth + + -- Unrolls memset when the widest MOV is 8 bytes (thus another Reg + -- argument). Falls back to go4 when all 8 byte moves are + -- exhausted. + go8 :: Reg -> Reg -> Integer -> InstrBlock + go8 dst reg8byte left = + if possibleWidth >= 8 then + let curMov = gen8 dst_addr reg8byte + in curMov `appOL` go8 dst reg8byte (left - 8) + else go4 dst left + where + possibleWidth = minimum [left, sizeBytes] + dst_addr = AddrBaseIndex (EABaseReg dst) EAIndexNone (ImmInteger (n - left)) genCCall _ _ (PrimTarget MO_WriteBarrier) _ _ _ = return nilOL -- write barrier compiles to no code on x86/x86-64; diff --git a/compiler/utils/Util.hs b/compiler/utils/Util.hs index 9e67a43..6f7a9e5 100644 --- a/compiler/utils/Util.hs +++ b/compiler/utils/Util.hs @@ -87,6 +87,7 @@ module Util ( -- * Integers exactLog2, + byteAlignment, -- * Floating point readRational, @@ -1149,6 +1150,15 @@ exactLog2 x pow2 x | x == 1 = 0 | otherwise = 1 + pow2 (x `shiftR` 1) +-- x is aligned at N bytes means the remainder from x / N is zero. +-- Currently, interested in N <= 8, but can be expanded to N <= 16 or +-- N <= 32 if used within SSE or AVX context. +byteAlignment :: Integer -> Integer +byteAlignment x = case x .&. 7 of + 0 -> 8 + 4 -> 4 + 2 -> 2 + _ -> 1 {- -- ----------------------------------------------------------------------------- diff --git a/testsuite/driver/testlib.py b/testsuite/driver/testlib.py index 3fefb52..95274f3 100644 --- a/testsuite/driver/testlib.py +++ b/testsuite/driver/testlib.py @@ -1131,9 +1131,9 @@ def do_compile(name, way, should_fail, top_mod, extra_mods, extra_hc_opts, **kwa # no problems found, this test passed return passed() -def compile_cmp_asm( name, way, extra_hc_opts ): +def compile_cmp_asm( name, way, ext, extra_hc_opts ): print('Compile only, extra args = ', extra_hc_opts) - result = simple_build(name + '.cmm', way, '-keep-s-files -O ' + extra_hc_opts, 0, '', 0, 0) + result = simple_build(name + '.' + ext, way, '-keep-s-files -O ' + extra_hc_opts, 0, '', 0, 0) if badResult(result): return result @@ -1153,6 +1153,24 @@ def compile_cmp_asm( name, way, extra_hc_opts ): # no problems found, this test passed return passed() +def compile_grep_asm( name, way, ext, is_substring, extra_hc_opts ): + print('Compile only, extra args = ', extra_hc_opts) + result = simple_build(name + '.' + ext, way, '-keep-s-files -O ' + extra_hc_opts, 0, '', 0, 0) + + if badResult(result): + return result + + expected_pat_file = find_expected_file(name, 'asm') + actual_asm_file = add_suffix(name, 's') + + if not grep_output(join_normalisers(normalise_errmsg), + expected_pat_file, actual_asm_file, + is_substring): + return failBecause('asm mismatch') + + # no problems found, this test passed + return passed() + # ----------------------------------------------------------------------------- # Compile-and-run tests @@ -1735,6 +1753,43 @@ def compare_outputs(way, kind, normaliser, expected_file, actual_file, diff_file else: return False +# Checks that each line from pattern_file is present in actual_file as +# a substring or regex pattern depending on is_substring. +def grep_output(normaliser, pattern_file, actual_file, is_substring=True): + expected_path = in_srcdir(pattern_file) + actual_path = in_testdir(actual_file) + + expected_patterns = read_no_crs(expected_path).strip().split('\n') + actual_raw = read_no_crs(actual_path) + actual_str = normaliser(actual_raw) + + success = True + failed_patterns = [] + + def regex_match(pat, actual): + return re.search(pat, actual) is not None + + def substring_match(pat, actual): + return pat in actual + + def is_match(pat, actual): + if is_substring: + return substring_match(pat, actual) + else: + return regex_match(pat, actual) + + for pat in expected_patterns: + if not is_match(pat, actual_str): + success = False + failed_patterns.append(pat) + + if not success: + print('Actual output does not contain the following patterns:') + for pat in failed_patterns: + print(pat) + + return success + # Note [Output comparison] # # We do two types of output comparison: diff --git a/testsuite/tests/codeGen/should_gen_asm/all.T b/testsuite/tests/codeGen/should_gen_asm/all.T index 08a0472..7e35ec3 100644 --- a/testsuite/tests/codeGen/should_gen_asm/all.T +++ b/testsuite/tests/codeGen/should_gen_asm/all.T @@ -3,7 +3,8 @@ is_amd64_codegen = [ when(unregisterised(), skip), ] -test('memcpy', is_amd64_codegen, compile_cmp_asm, ['']) -test('memcpy-unroll', is_amd64_codegen, compile_cmp_asm, ['']) -test('memcpy-unroll-conprop', is_amd64_codegen, compile_cmp_asm, ['']) -test('memset-unroll', is_amd64_codegen, compile_cmp_asm, ['']) +test('memcpy', is_amd64_codegen, compile_cmp_asm, ['cmm', '']) +test('memcpy-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', '']) +test('memcpy-unroll-conprop', is_amd64_codegen, compile_cmp_asm, ['cmm', '']) +test('memset-unroll', is_amd64_codegen, compile_cmp_asm, ['cmm', '']) +test('bytearray-memset-unroll', is_amd64_codegen, compile_grep_asm, ['hs', True, '']) diff --git a/testsuite/tests/codeGen/should_gen_asm/bytearray-memset-unroll.asm b/testsuite/tests/codeGen/should_gen_asm/bytearray-memset-unroll.asm new file mode 100644 index 0000000..666f36b --- /dev/null +++ b/testsuite/tests/codeGen/should_gen_asm/bytearray-memset-unroll.asm @@ -0,0 +1,6 @@ +movq $72340172838076673,%rcx +movq %rcx,0(%rbx) +movq %rcx,8(%rbx) +movl $16843009,16(%rbx) +movw $257,20(%rbx) +movb $1,22(%rbx) diff --git a/testsuite/tests/codeGen/should_gen_asm/bytearray-memset-unroll.hs b/testsuite/tests/codeGen/should_gen_asm/bytearray-memset-unroll.hs new file mode 100644 index 0000000..b5108d4 --- /dev/null +++ b/testsuite/tests/codeGen/should_gen_asm/bytearray-memset-unroll.hs @@ -0,0 +1,17 @@ +{-# language MagicHash #-} +{-# language UnboxedTuples #-} + +module FillArray + ( fill + ) where + +import GHC.Exts +import GHC.IO + +data ByteArray = ByteArray ByteArray# + +fill :: IO ByteArray +fill = IO $ \s0 -> case newByteArray# 24# s0 of + (# s1, m #) -> case setByteArray# m 0# 23# 1# s1 of + s2 -> case unsafeFreezeByteArray# m s2 of + (# s3, r #) -> (# s3, ByteArray r #) |