Bump deps.

This commit is contained in:
Jeff Mitchell
2016-05-02 20:11:05 -04:00
parent 5f47acad1c
commit e2091c34e7
30 changed files with 1206 additions and 441 deletions

View File

@@ -5,3 +5,103 @@ $ go get github.com/golang/snappy
Unless otherwise noted, the Snappy-Go source files are distributed
under the BSD-style license found in the LICENSE file.
Benchmarks.
The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
or so files, the same set used by the C++ Snappy code (github.com/google/snappy
and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
3.40GHz", Go's GOARCH=amd64 numbers as of 2016-04-29:
"go test -test.bench=."
_UFlat0-8 2.23GB/s ± 1% html
_UFlat1-8 1.43GB/s ± 0% urls
_UFlat2-8 23.7GB/s ± 1% jpg
_UFlat3-8 1.93GB/s ± 0% jpg_200
_UFlat4-8 13.9GB/s ± 2% pdf
_UFlat5-8 2.00GB/s ± 0% html4
_UFlat6-8 829MB/s ± 0% txt1
_UFlat7-8 799MB/s ± 0% txt2
_UFlat8-8 871MB/s ± 0% txt3
_UFlat9-8 730MB/s ± 0% txt4
_UFlat10-8 2.87GB/s ± 0% pb
_UFlat11-8 1.07GB/s ± 0% gaviota
_ZFlat0-8 1.04GB/s ± 0% html
_ZFlat1-8 536MB/s ± 0% urls
_ZFlat2-8 16.3GB/s ± 2% jpg
_ZFlat3-8 762MB/s ± 0% jpg_200
_ZFlat4-8 9.48GB/s ± 1% pdf
_ZFlat5-8 990MB/s ± 0% html4
_ZFlat6-8 381MB/s ± 0% txt1
_ZFlat7-8 353MB/s ± 0% txt2
_ZFlat8-8 398MB/s ± 0% txt3
_ZFlat9-8 329MB/s ± 0% txt4
_ZFlat10-8 1.35GB/s ± 1% pb
_ZFlat11-8 608MB/s ± 0% gaviota
"go test -test.bench=. -tags=noasm"
_UFlat0-8 637MB/s ± 0% html
_UFlat1-8 506MB/s ± 0% urls
_UFlat2-8 23.0GB/s ± 5% jpg
_UFlat3-8 1.17GB/s ± 0% jpg_200
_UFlat4-8 4.44GB/s ± 1% pdf
_UFlat5-8 623MB/s ± 0% html4
_UFlat6-8 300MB/s ± 1% txt1
_UFlat7-8 293MB/s ± 0% txt2
_UFlat8-8 316MB/s ± 0% txt3
_UFlat9-8 285MB/s ± 0% txt4
_UFlat10-8 768MB/s ± 0% pb
_UFlat11-8 406MB/s ± 1% gaviota
_ZFlat0-8 411MB/s ± 1% html
_ZFlat1-8 250MB/s ± 1% urls
_ZFlat2-8 12.7GB/s ± 1% jpg
_ZFlat3-8 157MB/s ± 0% jpg_200
_ZFlat4-8 2.95GB/s ± 0% pdf
_ZFlat5-8 406MB/s ± 0% html4
_ZFlat6-8 182MB/s ± 0% txt1
_ZFlat7-8 173MB/s ± 1% txt2
_ZFlat8-8 191MB/s ± 0% txt3
_ZFlat9-8 166MB/s ± 0% txt4
_ZFlat10-8 480MB/s ± 0% pb
_ZFlat11-8 272MB/s ± 0% gaviota
For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
are the numbers from C++ Snappy's
make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
BM_UFlat/0 2.4GB/s html
BM_UFlat/1 1.4GB/s urls
BM_UFlat/2 21.8GB/s jpg
BM_UFlat/3 1.5GB/s jpg_200
BM_UFlat/4 13.3GB/s pdf
BM_UFlat/5 2.1GB/s html4
BM_UFlat/6 1.0GB/s txt1
BM_UFlat/7 959.4MB/s txt2
BM_UFlat/8 1.0GB/s txt3
BM_UFlat/9 864.5MB/s txt4
BM_UFlat/10 2.9GB/s pb
BM_UFlat/11 1.2GB/s gaviota
BM_ZFlat/0 944.3MB/s html (22.31 %)
BM_ZFlat/1 501.6MB/s urls (47.78 %)
BM_ZFlat/2 14.3GB/s jpg (99.95 %)
BM_ZFlat/3 538.3MB/s jpg_200 (73.00 %)
BM_ZFlat/4 8.3GB/s pdf (83.30 %)
BM_ZFlat/5 903.5MB/s html4 (22.52 %)
BM_ZFlat/6 336.0MB/s txt1 (57.88 %)
BM_ZFlat/7 312.3MB/s txt2 (61.91 %)
BM_ZFlat/8 353.1MB/s txt3 (54.99 %)
BM_ZFlat/9 289.9MB/s txt4 (66.26 %)
BM_ZFlat/10 1.2GB/s pb (19.68 %)
BM_ZFlat/11 527.4MB/s gaviota (37.72 %)

View File

@@ -8,10 +8,17 @@
#include "textflag.h"
// TODO: figure out why the XXX lines compile with Go 1.4 and Go tip but not
// Go 1.6.
// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// This is https://github.com/golang/snappy/issues/29
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.
// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
@@ -21,19 +28,23 @@
// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
// - AX return value
// - AX len(lit)
// - BX n
// - CX len(lit)
// - SI &lit[0]
// - DX return value
// - DI &dst[i]
// - R10 &lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
MOVQ dst_base+0(FP), DI
MOVQ lit_base+24(FP), SI
MOVQ lit_len+32(FP), CX
MOVQ CX, AX
MOVL CX, BX
MOVQ lit_base+24(FP), R10
MOVQ lit_len+32(FP), AX
MOVQ AX, DX
MOVL AX, BX
SUBL $1, BX
CMPL BX, $60
@@ -45,32 +56,32 @@ threeBytes:
MOVB $0xf4, 0(DI)
MOVW BX, 1(DI)
ADDQ $3, DI
ADDQ $3, AX
JMP emitLiteralEnd
ADDQ $3, DX
JMP memmove
twoBytes:
MOVB $0xf0, 0(DI)
MOVB BX, 1(DI)
ADDQ $2, DI
ADDQ $2, AX
JMP emitLiteralEnd
ADDQ $2, DX
JMP memmove
oneByte:
SHLB $2, BX
MOVB BX, 0(DI)
ADDQ $1, DI
ADDQ $1, AX
ADDQ $1, DX
emitLiteralEnd:
MOVQ AX, ret+48(FP)
memmove:
MOVQ DX, ret+48(FP)
// copy(dst[i:], lit)
//
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
// DI, SI and CX as arguments.
// DI, R10 and AX as arguments.
MOVQ DI, 0(SP)
MOVQ SI, 8(SP)
MOVQ CX, 16(SP)
MOVQ R10, 8(SP)
MOVQ AX, 16(SP)
CALL runtime·memmove(SB)
RET
@@ -79,55 +90,59 @@ emitLiteralEnd:
// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
// - BX offset
// - CX length
// - AX length
// - SI &dst[0]
// - DI &dst[i]
// - R11 offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
MOVQ dst_base+0(FP), DI
MOVQ DI, SI
MOVQ offset+24(FP), BX
MOVQ length+32(FP), CX
MOVQ offset+24(FP), R11
MOVQ length+32(FP), AX
loop0:
// for length >= 68 { etc }
CMPL CX, $68
CMPL AX, $68
JLT step1
// Emit a length 64 copy, encoded as 3 bytes.
MOVB $0xfe, 0(DI)
MOVW BX, 1(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $64, CX
SUBL $64, AX
JMP loop0
step1:
// if length > 64 { etc }
CMPL CX, $64
CMPL AX, $64
JLE step2
// Emit a length 60 copy, encoded as 3 bytes.
MOVB $0xee, 0(DI)
MOVW BX, 1(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $60, CX
SUBL $60, AX
step2:
// if length >= 12 || offset >= 2048 { goto step3 }
CMPL CX, $12
CMPL AX, $12
JGE step3
CMPL BX, $2048
CMPL R11, $2048
JGE step3
// Emit the remaining copy, encoded as 2 bytes.
MOVB BX, 1(DI)
SHRL $8, BX
SHLB $5, BX
SUBB $4, CX
SHLB $2, CX
ORB CX, BX
ORB $1, BX
MOVB BX, 0(DI)
MOVB R11, 1(DI)
SHRL $8, R11
SHLB $5, R11
SUBB $4, AX
SHLB $2, AX
ORB AX, R11
ORB $1, R11
MOVB R11, 0(DI)
ADDQ $2, DI
// Return the number of bytes written.
@@ -137,11 +152,11 @@ step2:
step3:
// Emit the remaining copy, encoded as 3 bytes.
SUBL $1, CX
SHLB $2, CX
ORB $2, CX
MOVB CX, 0(DI)
MOVW BX, 1(DI)
SUBL $1, AX
SHLB $2, AX
ORB $2, AX
MOVB AX, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
// Return the number of bytes written.
@@ -154,33 +169,37 @@ step3:
// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
// - CX &src[0]
// - DX &src[len(src)]
// - SI &src[i]
// - DI &src[j]
// - R9 &src[len(src) - 8]
// - DX &src[0]
// - SI &src[j]
// - R13 &src[len(src) - 8]
// - R14 &src[len(src)]
// - R15 &src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
MOVQ src_base+0(FP), CX
MOVQ src_len+8(FP), DX
MOVQ i+24(FP), SI
MOVQ j+32(FP), DI
ADDQ CX, DX
ADDQ CX, SI
ADDQ CX, DI
MOVQ DX, R9
SUBQ $8, R9
MOVQ src_base+0(FP), DX
MOVQ src_len+8(FP), R14
MOVQ i+24(FP), R15
MOVQ j+32(FP), SI
ADDQ DX, R14
ADDQ DX, R15
ADDQ DX, SI
MOVQ R14, R13
SUBQ $8, R13
cmp8:
// As long as we are 8 or more bytes before the end of src, we can load and
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
CMPQ DI, R9
CMPQ SI, R13
JA cmp1
MOVQ (SI), AX
MOVQ (DI), BX
MOVQ (R15), AX
MOVQ (SI), BX
CMPQ AX, BX
JNE bsf
ADDQ $8, R15
ADDQ $8, SI
ADDQ $8, DI
JMP cmp8
bsf:
@@ -191,29 +210,29 @@ bsf:
XORQ AX, BX
BSFQ BX, BX
SHRQ $3, BX
ADDQ BX, DI
ADDQ BX, SI
// Convert from &src[ret] to ret.
SUBQ CX, DI
MOVQ DI, ret+40(FP)
SUBQ DX, SI
MOVQ SI, ret+40(FP)
RET
cmp1:
// In src's tail, compare 1 byte at a time.
CMPQ DI, DX
CMPQ SI, R14
JAE extendMatchEnd
MOVB (SI), AX
MOVB (DI), BX
MOVB (R15), AX
MOVB (SI), BX
CMPB AX, BX
JNE extendMatchEnd
ADDQ $1, R15
ADDQ $1, SI
ADDQ $1, DI
JMP cmp1
extendMatchEnd:
// Convert from &src[ret] to ret.
SUBQ CX, DI
MOVQ DI, ret+40(FP)
SUBQ DX, SI
MOVQ SI, ret+40(FP)
RET
// ----------------------------------------------------------------------------
@@ -232,8 +251,8 @@ extendMatchEnd:
// - R10 . &src[nextEmit]
// - R11 96 prevHash, currHash, nextHash, offset
// - R12 104 &src[base], skip
// - R13 . &src[nextS]
// - R14 . len(src), bytesBetweenHashLookups, x
// - R13 . &src[nextS], &src[len(src) - 8]
// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
// - R15 112 candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
@@ -352,6 +371,7 @@ inner0:
// table[nextHash] = uint16(s)
MOVQ SI, AX
SUBQ DX, AX
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66
@@ -384,32 +404,63 @@ fourByteMatch:
CMPQ AX, $16
JLE emitLiteralFastPath
// d += emitLiteral(dst[d:], src[nextEmit:s])
// ----------------------------------------
// Begin inline of the emitLiteral call.
//
// Push args.
MOVQ DI, 0(SP)
MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ R10, 24(SP)
MOVQ AX, 32(SP)
MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
// d += emitLiteral(dst[d:], src[nextEmit:s])
MOVL AX, BX
SUBL $1, BX
CMPL BX, $60
JLT inlineEmitLiteralOneByte
CMPL BX, $256
JLT inlineEmitLiteralTwoBytes
inlineEmitLiteralThreeBytes:
MOVB $0xf4, 0(DI)
MOVW BX, 1(DI)
ADDQ $3, DI
JMP inlineEmitLiteralMemmove
inlineEmitLiteralTwoBytes:
MOVB $0xf0, 0(DI)
MOVB BX, 1(DI)
ADDQ $2, DI
JMP inlineEmitLiteralMemmove
inlineEmitLiteralOneByte:
SHLB $2, BX
MOVB BX, 0(DI)
ADDQ $1, DI
inlineEmitLiteralMemmove:
// Spill local variables (registers) onto the stack; call; unspill.
//
// copy(dst[i:], lit)
//
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
// DI, R10 and AX as arguments.
MOVQ DI, 0(SP)
MOVQ R10, 8(SP)
MOVQ AX, 16(SP)
ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
MOVQ SI, 72(SP)
MOVQ DI, 80(SP)
MOVQ R15, 112(SP)
CALL ·emitLiteral(SB)
CALL runtime·memmove(SB)
MOVQ 56(SP), CX
MOVQ 64(SP), DX
MOVQ 72(SP), SI
MOVQ 80(SP), DI
MOVQ 88(SP), R9
MOVQ 112(SP), R15
// Finish the "d +=" part of "d += emitLiteral(etc)".
ADDQ 48(SP), DI
JMP inner1
inlineEmitLiteralEnd:
// End inline of the emitLiteral call.
// ----------------------------------------
emitLiteralFastPath:
// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
MOVB AX, BX
@@ -442,60 +493,129 @@ inner1:
SUBQ R15, R11
SUBQ DX, R11
// ----------------------------------------
// Begin inline of the extendMatch call.
//
// s = extendMatch(src, candidate+4, s+4)
//
// Push args.
MOVQ DX, 0(SP)
// !!! R14 = &src[len(src)]
MOVQ src_len+32(FP), R14
MOVQ R14, 8(SP)
MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
ADDQ DX, R14
// !!! R13 = &src[len(src) - 8]
MOVQ R14, R13
SUBQ $8, R13
// !!! R15 = &src[candidate + 4]
ADDQ $4, R15
MOVQ R15, 24(SP)
ADDQ DX, R15
// !!! s += 4
ADDQ $4, SI
SUBQ DX, SI
MOVQ SI, 32(SP)
// Spill local variables (registers) onto the stack; call; unspill.
inlineExtendMatchCmp8:
// As long as we are 8 or more bytes before the end of src, we can load and
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
CMPQ SI, R13
JA inlineExtendMatchCmp1
MOVQ (R15), AX
MOVQ (SI), BX
CMPQ AX, BX
JNE inlineExtendMatchBSF
ADDQ $8, R15
ADDQ $8, SI
JMP inlineExtendMatchCmp8
inlineExtendMatchBSF:
// If those 8 bytes were not equal, XOR the two 8 byte values, and return
// the index of the first byte that differs. The BSF instruction finds the
// least significant 1 bit, the amd64 architecture is little-endian, and
// the shift by 3 converts a bit index to a byte index.
XORQ AX, BX
BSFQ BX, BX
SHRQ $3, BX
ADDQ BX, SI
JMP inlineExtendMatchEnd
inlineExtendMatchCmp1:
// In src's tail, compare 1 byte at a time.
CMPQ SI, R14
JAE inlineExtendMatchEnd
MOVB (R15), AX
MOVB (SI), BX
CMPB AX, BX
JNE inlineExtendMatchEnd
ADDQ $1, R15
ADDQ $1, SI
JMP inlineExtendMatchCmp1
inlineExtendMatchEnd:
// End inline of the extendMatch call.
// ----------------------------------------
// ----------------------------------------
// Begin inline of the emitCopy call.
//
// We don't need to unspill CX or R9 as we are just about to call another
// function.
MOVQ DI, 80(SP)
MOVQ R11, 96(SP)
MOVQ R12, 104(SP)
CALL ·extendMatch(SB)
MOVQ 64(SP), DX
MOVQ 80(SP), DI
MOVQ 96(SP), R11
MOVQ 104(SP), R12
// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
// register holds &src[s], not s.
MOVQ 40(SP), SI
ADDQ DX, SI
// d += emitCopy(dst[d:], base-candidate, s-base)
//
// Push args.
MOVQ DI, 0(SP)
MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ R11, 24(SP)
// !!! length := s - base
MOVQ SI, AX
SUBQ R12, AX
MOVQ AX, 32(SP)
// Spill local variables (registers) onto the stack; call; unspill.
MOVQ SI, 72(SP)
MOVQ DI, 80(SP)
CALL ·emitCopy(SB)
MOVQ 56(SP), CX
MOVQ 64(SP), DX
MOVQ 72(SP), SI
MOVQ 80(SP), DI
MOVQ 88(SP), R9
inlineEmitCopyLoop0:
// for length >= 68 { etc }
CMPL AX, $68
JLT inlineEmitCopyStep1
// Finish the "d +=" part of "d += emitCopy(etc)".
ADDQ 40(SP), DI
// Emit a length 64 copy, encoded as 3 bytes.
MOVB $0xfe, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $64, AX
JMP inlineEmitCopyLoop0
inlineEmitCopyStep1:
// if length > 64 { etc }
CMPL AX, $64
JLE inlineEmitCopyStep2
// Emit a length 60 copy, encoded as 3 bytes.
MOVB $0xee, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $60, AX
inlineEmitCopyStep2:
// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
CMPL AX, $12
JGE inlineEmitCopyStep3
CMPL R11, $2048
JGE inlineEmitCopyStep3
// Emit the remaining copy, encoded as 2 bytes.
MOVB R11, 1(DI)
SHRL $8, R11
SHLB $5, R11
SUBB $4, AX
SHLB $2, AX
ORB AX, R11
ORB $1, R11
MOVB R11, 0(DI)
ADDQ $2, DI
JMP inlineEmitCopyEnd
inlineEmitCopyStep3:
// Emit the remaining copy, encoded as 3 bytes.
SUBL $1, AX
SHLB $2, AX
ORB $2, AX
MOVB AX, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
inlineEmitCopyEnd:
// End inline of the emitCopy call.
// ----------------------------------------
// nextEmit = s
MOVQ SI, R10
@@ -522,6 +642,7 @@ inner1:
MOVQ SI, AX
SUBQ DX, AX
SUBQ $1, AX
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66
@@ -549,6 +670,7 @@ inner1:
// table[currHash] = uint16(s)
ADDQ $1, AX
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66