mirror of
https://github.com/optim-enterprises-bv/vault.git
synced 2025-11-03 03:58:01 +00:00
Bump deps.
This commit is contained in:
100
vendor/github.com/golang/snappy/README
generated
vendored
100
vendor/github.com/golang/snappy/README
generated
vendored
@@ -5,3 +5,103 @@ $ go get github.com/golang/snappy
|
||||
|
||||
Unless otherwise noted, the Snappy-Go source files are distributed
|
||||
under the BSD-style license found in the LICENSE file.
|
||||
|
||||
|
||||
|
||||
Benchmarks.
|
||||
|
||||
The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
|
||||
or so files, the same set used by the C++ Snappy code (github.com/google/snappy
|
||||
and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
|
||||
3.40GHz", Go's GOARCH=amd64 numbers as of 2016-04-29:
|
||||
|
||||
"go test -test.bench=."
|
||||
|
||||
_UFlat0-8 2.23GB/s ± 1% html
|
||||
_UFlat1-8 1.43GB/s ± 0% urls
|
||||
_UFlat2-8 23.7GB/s ± 1% jpg
|
||||
_UFlat3-8 1.93GB/s ± 0% jpg_200
|
||||
_UFlat4-8 13.9GB/s ± 2% pdf
|
||||
_UFlat5-8 2.00GB/s ± 0% html4
|
||||
_UFlat6-8 829MB/s ± 0% txt1
|
||||
_UFlat7-8 799MB/s ± 0% txt2
|
||||
_UFlat8-8 871MB/s ± 0% txt3
|
||||
_UFlat9-8 730MB/s ± 0% txt4
|
||||
_UFlat10-8 2.87GB/s ± 0% pb
|
||||
_UFlat11-8 1.07GB/s ± 0% gaviota
|
||||
|
||||
_ZFlat0-8 1.04GB/s ± 0% html
|
||||
_ZFlat1-8 536MB/s ± 0% urls
|
||||
_ZFlat2-8 16.3GB/s ± 2% jpg
|
||||
_ZFlat3-8 762MB/s ± 0% jpg_200
|
||||
_ZFlat4-8 9.48GB/s ± 1% pdf
|
||||
_ZFlat5-8 990MB/s ± 0% html4
|
||||
_ZFlat6-8 381MB/s ± 0% txt1
|
||||
_ZFlat7-8 353MB/s ± 0% txt2
|
||||
_ZFlat8-8 398MB/s ± 0% txt3
|
||||
_ZFlat9-8 329MB/s ± 0% txt4
|
||||
_ZFlat10-8 1.35GB/s ± 1% pb
|
||||
_ZFlat11-8 608MB/s ± 0% gaviota
|
||||
|
||||
|
||||
|
||||
"go test -test.bench=. -tags=noasm"
|
||||
|
||||
_UFlat0-8 637MB/s ± 0% html
|
||||
_UFlat1-8 506MB/s ± 0% urls
|
||||
_UFlat2-8 23.0GB/s ± 5% jpg
|
||||
_UFlat3-8 1.17GB/s ± 0% jpg_200
|
||||
_UFlat4-8 4.44GB/s ± 1% pdf
|
||||
_UFlat5-8 623MB/s ± 0% html4
|
||||
_UFlat6-8 300MB/s ± 1% txt1
|
||||
_UFlat7-8 293MB/s ± 0% txt2
|
||||
_UFlat8-8 316MB/s ± 0% txt3
|
||||
_UFlat9-8 285MB/s ± 0% txt4
|
||||
_UFlat10-8 768MB/s ± 0% pb
|
||||
_UFlat11-8 406MB/s ± 1% gaviota
|
||||
|
||||
_ZFlat0-8 411MB/s ± 1% html
|
||||
_ZFlat1-8 250MB/s ± 1% urls
|
||||
_ZFlat2-8 12.7GB/s ± 1% jpg
|
||||
_ZFlat3-8 157MB/s ± 0% jpg_200
|
||||
_ZFlat4-8 2.95GB/s ± 0% pdf
|
||||
_ZFlat5-8 406MB/s ± 0% html4
|
||||
_ZFlat6-8 182MB/s ± 0% txt1
|
||||
_ZFlat7-8 173MB/s ± 1% txt2
|
||||
_ZFlat8-8 191MB/s ± 0% txt3
|
||||
_ZFlat9-8 166MB/s ± 0% txt4
|
||||
_ZFlat10-8 480MB/s ± 0% pb
|
||||
_ZFlat11-8 272MB/s ± 0% gaviota
|
||||
|
||||
|
||||
|
||||
For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
|
||||
are the numbers from C++ Snappy's
|
||||
|
||||
make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
|
||||
|
||||
BM_UFlat/0 2.4GB/s html
|
||||
BM_UFlat/1 1.4GB/s urls
|
||||
BM_UFlat/2 21.8GB/s jpg
|
||||
BM_UFlat/3 1.5GB/s jpg_200
|
||||
BM_UFlat/4 13.3GB/s pdf
|
||||
BM_UFlat/5 2.1GB/s html4
|
||||
BM_UFlat/6 1.0GB/s txt1
|
||||
BM_UFlat/7 959.4MB/s txt2
|
||||
BM_UFlat/8 1.0GB/s txt3
|
||||
BM_UFlat/9 864.5MB/s txt4
|
||||
BM_UFlat/10 2.9GB/s pb
|
||||
BM_UFlat/11 1.2GB/s gaviota
|
||||
|
||||
BM_ZFlat/0 944.3MB/s html (22.31 %)
|
||||
BM_ZFlat/1 501.6MB/s urls (47.78 %)
|
||||
BM_ZFlat/2 14.3GB/s jpg (99.95 %)
|
||||
BM_ZFlat/3 538.3MB/s jpg_200 (73.00 %)
|
||||
BM_ZFlat/4 8.3GB/s pdf (83.30 %)
|
||||
BM_ZFlat/5 903.5MB/s html4 (22.52 %)
|
||||
BM_ZFlat/6 336.0MB/s txt1 (57.88 %)
|
||||
BM_ZFlat/7 312.3MB/s txt2 (61.91 %)
|
||||
BM_ZFlat/8 353.1MB/s txt3 (54.99 %)
|
||||
BM_ZFlat/9 289.9MB/s txt4 (66.26 %)
|
||||
BM_ZFlat/10 1.2GB/s pb (19.68 %)
|
||||
BM_ZFlat/11 527.4MB/s gaviota (37.72 %)
|
||||
|
||||
380
vendor/github.com/golang/snappy/encode_amd64.s
generated
vendored
380
vendor/github.com/golang/snappy/encode_amd64.s
generated
vendored
@@ -8,10 +8,17 @@
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// TODO: figure out why the XXX lines compile with Go 1.4 and Go tip but not
|
||||
// Go 1.6.
|
||||
// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
|
||||
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
|
||||
// https://github.com/golang/snappy/issues/29
|
||||
//
|
||||
// This is https://github.com/golang/snappy/issues/29
|
||||
// As a workaround, the package was built with a known good assembler, and
|
||||
// those instructions were disassembled by "objdump -d" to yield the
|
||||
// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
|
||||
// style comments, in AT&T asm syntax. Note that rsp here is a physical
|
||||
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
|
||||
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
|
||||
// fine on Go 1.6.
|
||||
|
||||
// The asm code generally follows the pure Go code in encode_other.go, except
|
||||
// where marked with a "!!!".
|
||||
@@ -21,19 +28,23 @@
|
||||
// func emitLiteral(dst, lit []byte) int
|
||||
//
|
||||
// All local variables fit into registers. The register allocation:
|
||||
// - AX return value
|
||||
// - AX len(lit)
|
||||
// - BX n
|
||||
// - CX len(lit)
|
||||
// - SI &lit[0]
|
||||
// - DX return value
|
||||
// - DI &dst[i]
|
||||
// - R10 &lit[0]
|
||||
//
|
||||
// The 24 bytes of stack space is to call runtime·memmove.
|
||||
//
|
||||
// The unusual register allocation of local variables, such as R10 for the
|
||||
// source pointer, matches the allocation used at the call site in encodeBlock,
|
||||
// which makes it easier to manually inline this function.
|
||||
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ lit_base+24(FP), SI
|
||||
MOVQ lit_len+32(FP), CX
|
||||
MOVQ CX, AX
|
||||
MOVL CX, BX
|
||||
MOVQ lit_base+24(FP), R10
|
||||
MOVQ lit_len+32(FP), AX
|
||||
MOVQ AX, DX
|
||||
MOVL AX, BX
|
||||
SUBL $1, BX
|
||||
|
||||
CMPL BX, $60
|
||||
@@ -45,32 +56,32 @@ threeBytes:
|
||||
MOVB $0xf4, 0(DI)
|
||||
MOVW BX, 1(DI)
|
||||
ADDQ $3, DI
|
||||
ADDQ $3, AX
|
||||
JMP emitLiteralEnd
|
||||
ADDQ $3, DX
|
||||
JMP memmove
|
||||
|
||||
twoBytes:
|
||||
MOVB $0xf0, 0(DI)
|
||||
MOVB BX, 1(DI)
|
||||
ADDQ $2, DI
|
||||
ADDQ $2, AX
|
||||
JMP emitLiteralEnd
|
||||
ADDQ $2, DX
|
||||
JMP memmove
|
||||
|
||||
oneByte:
|
||||
SHLB $2, BX
|
||||
MOVB BX, 0(DI)
|
||||
ADDQ $1, DI
|
||||
ADDQ $1, AX
|
||||
ADDQ $1, DX
|
||||
|
||||
emitLiteralEnd:
|
||||
MOVQ AX, ret+48(FP)
|
||||
memmove:
|
||||
MOVQ DX, ret+48(FP)
|
||||
|
||||
// copy(dst[i:], lit)
|
||||
//
|
||||
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
|
||||
// DI, SI and CX as arguments.
|
||||
// DI, R10 and AX as arguments.
|
||||
MOVQ DI, 0(SP)
|
||||
MOVQ SI, 8(SP)
|
||||
MOVQ CX, 16(SP)
|
||||
MOVQ R10, 8(SP)
|
||||
MOVQ AX, 16(SP)
|
||||
CALL runtime·memmove(SB)
|
||||
RET
|
||||
|
||||
@@ -79,55 +90,59 @@ emitLiteralEnd:
|
||||
// func emitCopy(dst []byte, offset, length int) int
|
||||
//
|
||||
// All local variables fit into registers. The register allocation:
|
||||
// - BX offset
|
||||
// - CX length
|
||||
// - AX length
|
||||
// - SI &dst[0]
|
||||
// - DI &dst[i]
|
||||
// - R11 offset
|
||||
//
|
||||
// The unusual register allocation of local variables, such as R11 for the
|
||||
// offset, matches the allocation used at the call site in encodeBlock, which
|
||||
// makes it easier to manually inline this function.
|
||||
TEXT ·emitCopy(SB), NOSPLIT, $0-48
|
||||
MOVQ dst_base+0(FP), DI
|
||||
MOVQ DI, SI
|
||||
MOVQ offset+24(FP), BX
|
||||
MOVQ length+32(FP), CX
|
||||
MOVQ offset+24(FP), R11
|
||||
MOVQ length+32(FP), AX
|
||||
|
||||
loop0:
|
||||
// for length >= 68 { etc }
|
||||
CMPL CX, $68
|
||||
CMPL AX, $68
|
||||
JLT step1
|
||||
|
||||
// Emit a length 64 copy, encoded as 3 bytes.
|
||||
MOVB $0xfe, 0(DI)
|
||||
MOVW BX, 1(DI)
|
||||
MOVW R11, 1(DI)
|
||||
ADDQ $3, DI
|
||||
SUBL $64, CX
|
||||
SUBL $64, AX
|
||||
JMP loop0
|
||||
|
||||
step1:
|
||||
// if length > 64 { etc }
|
||||
CMPL CX, $64
|
||||
CMPL AX, $64
|
||||
JLE step2
|
||||
|
||||
// Emit a length 60 copy, encoded as 3 bytes.
|
||||
MOVB $0xee, 0(DI)
|
||||
MOVW BX, 1(DI)
|
||||
MOVW R11, 1(DI)
|
||||
ADDQ $3, DI
|
||||
SUBL $60, CX
|
||||
SUBL $60, AX
|
||||
|
||||
step2:
|
||||
// if length >= 12 || offset >= 2048 { goto step3 }
|
||||
CMPL CX, $12
|
||||
CMPL AX, $12
|
||||
JGE step3
|
||||
CMPL BX, $2048
|
||||
CMPL R11, $2048
|
||||
JGE step3
|
||||
|
||||
// Emit the remaining copy, encoded as 2 bytes.
|
||||
MOVB BX, 1(DI)
|
||||
SHRL $8, BX
|
||||
SHLB $5, BX
|
||||
SUBB $4, CX
|
||||
SHLB $2, CX
|
||||
ORB CX, BX
|
||||
ORB $1, BX
|
||||
MOVB BX, 0(DI)
|
||||
MOVB R11, 1(DI)
|
||||
SHRL $8, R11
|
||||
SHLB $5, R11
|
||||
SUBB $4, AX
|
||||
SHLB $2, AX
|
||||
ORB AX, R11
|
||||
ORB $1, R11
|
||||
MOVB R11, 0(DI)
|
||||
ADDQ $2, DI
|
||||
|
||||
// Return the number of bytes written.
|
||||
@@ -137,11 +152,11 @@ step2:
|
||||
|
||||
step3:
|
||||
// Emit the remaining copy, encoded as 3 bytes.
|
||||
SUBL $1, CX
|
||||
SHLB $2, CX
|
||||
ORB $2, CX
|
||||
MOVB CX, 0(DI)
|
||||
MOVW BX, 1(DI)
|
||||
SUBL $1, AX
|
||||
SHLB $2, AX
|
||||
ORB $2, AX
|
||||
MOVB AX, 0(DI)
|
||||
MOVW R11, 1(DI)
|
||||
ADDQ $3, DI
|
||||
|
||||
// Return the number of bytes written.
|
||||
@@ -154,33 +169,37 @@ step3:
|
||||
// func extendMatch(src []byte, i, j int) int
|
||||
//
|
||||
// All local variables fit into registers. The register allocation:
|
||||
// - CX &src[0]
|
||||
// - DX &src[len(src)]
|
||||
// - SI &src[i]
|
||||
// - DI &src[j]
|
||||
// - R9 &src[len(src) - 8]
|
||||
// - DX &src[0]
|
||||
// - SI &src[j]
|
||||
// - R13 &src[len(src) - 8]
|
||||
// - R14 &src[len(src)]
|
||||
// - R15 &src[i]
|
||||
//
|
||||
// The unusual register allocation of local variables, such as R15 for a source
|
||||
// pointer, matches the allocation used at the call site in encodeBlock, which
|
||||
// makes it easier to manually inline this function.
|
||||
TEXT ·extendMatch(SB), NOSPLIT, $0-48
|
||||
MOVQ src_base+0(FP), CX
|
||||
MOVQ src_len+8(FP), DX
|
||||
MOVQ i+24(FP), SI
|
||||
MOVQ j+32(FP), DI
|
||||
ADDQ CX, DX
|
||||
ADDQ CX, SI
|
||||
ADDQ CX, DI
|
||||
MOVQ DX, R9
|
||||
SUBQ $8, R9
|
||||
MOVQ src_base+0(FP), DX
|
||||
MOVQ src_len+8(FP), R14
|
||||
MOVQ i+24(FP), R15
|
||||
MOVQ j+32(FP), SI
|
||||
ADDQ DX, R14
|
||||
ADDQ DX, R15
|
||||
ADDQ DX, SI
|
||||
MOVQ R14, R13
|
||||
SUBQ $8, R13
|
||||
|
||||
cmp8:
|
||||
// As long as we are 8 or more bytes before the end of src, we can load and
|
||||
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
|
||||
CMPQ DI, R9
|
||||
CMPQ SI, R13
|
||||
JA cmp1
|
||||
MOVQ (SI), AX
|
||||
MOVQ (DI), BX
|
||||
MOVQ (R15), AX
|
||||
MOVQ (SI), BX
|
||||
CMPQ AX, BX
|
||||
JNE bsf
|
||||
ADDQ $8, R15
|
||||
ADDQ $8, SI
|
||||
ADDQ $8, DI
|
||||
JMP cmp8
|
||||
|
||||
bsf:
|
||||
@@ -191,29 +210,29 @@ bsf:
|
||||
XORQ AX, BX
|
||||
BSFQ BX, BX
|
||||
SHRQ $3, BX
|
||||
ADDQ BX, DI
|
||||
ADDQ BX, SI
|
||||
|
||||
// Convert from &src[ret] to ret.
|
||||
SUBQ CX, DI
|
||||
MOVQ DI, ret+40(FP)
|
||||
SUBQ DX, SI
|
||||
MOVQ SI, ret+40(FP)
|
||||
RET
|
||||
|
||||
cmp1:
|
||||
// In src's tail, compare 1 byte at a time.
|
||||
CMPQ DI, DX
|
||||
CMPQ SI, R14
|
||||
JAE extendMatchEnd
|
||||
MOVB (SI), AX
|
||||
MOVB (DI), BX
|
||||
MOVB (R15), AX
|
||||
MOVB (SI), BX
|
||||
CMPB AX, BX
|
||||
JNE extendMatchEnd
|
||||
ADDQ $1, R15
|
||||
ADDQ $1, SI
|
||||
ADDQ $1, DI
|
||||
JMP cmp1
|
||||
|
||||
extendMatchEnd:
|
||||
// Convert from &src[ret] to ret.
|
||||
SUBQ CX, DI
|
||||
MOVQ DI, ret+40(FP)
|
||||
SUBQ DX, SI
|
||||
MOVQ SI, ret+40(FP)
|
||||
RET
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
@@ -232,8 +251,8 @@ extendMatchEnd:
|
||||
// - R10 . &src[nextEmit]
|
||||
// - R11 96 prevHash, currHash, nextHash, offset
|
||||
// - R12 104 &src[base], skip
|
||||
// - R13 . &src[nextS]
|
||||
// - R14 . len(src), bytesBetweenHashLookups, x
|
||||
// - R13 . &src[nextS], &src[len(src) - 8]
|
||||
// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
|
||||
// - R15 112 candidate
|
||||
//
|
||||
// The second column (56, 64, etc) is the stack offset to spill the registers
|
||||
@@ -352,6 +371,7 @@ inner0:
|
||||
// table[nextHash] = uint16(s)
|
||||
MOVQ SI, AX
|
||||
SUBQ DX, AX
|
||||
|
||||
// XXX: MOVW AX, table-32768(SP)(R11*2)
|
||||
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
|
||||
BYTE $0x66
|
||||
@@ -384,32 +404,63 @@ fourByteMatch:
|
||||
CMPQ AX, $16
|
||||
JLE emitLiteralFastPath
|
||||
|
||||
// d += emitLiteral(dst[d:], src[nextEmit:s])
|
||||
// ----------------------------------------
|
||||
// Begin inline of the emitLiteral call.
|
||||
//
|
||||
// Push args.
|
||||
MOVQ DI, 0(SP)
|
||||
MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
|
||||
MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
|
||||
MOVQ R10, 24(SP)
|
||||
MOVQ AX, 32(SP)
|
||||
MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
|
||||
// d += emitLiteral(dst[d:], src[nextEmit:s])
|
||||
|
||||
MOVL AX, BX
|
||||
SUBL $1, BX
|
||||
|
||||
CMPL BX, $60
|
||||
JLT inlineEmitLiteralOneByte
|
||||
CMPL BX, $256
|
||||
JLT inlineEmitLiteralTwoBytes
|
||||
|
||||
inlineEmitLiteralThreeBytes:
|
||||
MOVB $0xf4, 0(DI)
|
||||
MOVW BX, 1(DI)
|
||||
ADDQ $3, DI
|
||||
JMP inlineEmitLiteralMemmove
|
||||
|
||||
inlineEmitLiteralTwoBytes:
|
||||
MOVB $0xf0, 0(DI)
|
||||
MOVB BX, 1(DI)
|
||||
ADDQ $2, DI
|
||||
JMP inlineEmitLiteralMemmove
|
||||
|
||||
inlineEmitLiteralOneByte:
|
||||
SHLB $2, BX
|
||||
MOVB BX, 0(DI)
|
||||
ADDQ $1, DI
|
||||
|
||||
inlineEmitLiteralMemmove:
|
||||
// Spill local variables (registers) onto the stack; call; unspill.
|
||||
//
|
||||
// copy(dst[i:], lit)
|
||||
//
|
||||
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
|
||||
// DI, R10 and AX as arguments.
|
||||
MOVQ DI, 0(SP)
|
||||
MOVQ R10, 8(SP)
|
||||
MOVQ AX, 16(SP)
|
||||
ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
|
||||
MOVQ SI, 72(SP)
|
||||
MOVQ DI, 80(SP)
|
||||
MOVQ R15, 112(SP)
|
||||
CALL ·emitLiteral(SB)
|
||||
CALL runtime·memmove(SB)
|
||||
MOVQ 56(SP), CX
|
||||
MOVQ 64(SP), DX
|
||||
MOVQ 72(SP), SI
|
||||
MOVQ 80(SP), DI
|
||||
MOVQ 88(SP), R9
|
||||
MOVQ 112(SP), R15
|
||||
|
||||
// Finish the "d +=" part of "d += emitLiteral(etc)".
|
||||
ADDQ 48(SP), DI
|
||||
JMP inner1
|
||||
|
||||
inlineEmitLiteralEnd:
|
||||
// End inline of the emitLiteral call.
|
||||
// ----------------------------------------
|
||||
|
||||
emitLiteralFastPath:
|
||||
// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
|
||||
MOVB AX, BX
|
||||
@@ -442,60 +493,129 @@ inner1:
|
||||
SUBQ R15, R11
|
||||
SUBQ DX, R11
|
||||
|
||||
// ----------------------------------------
|
||||
// Begin inline of the extendMatch call.
|
||||
//
|
||||
// s = extendMatch(src, candidate+4, s+4)
|
||||
//
|
||||
// Push args.
|
||||
MOVQ DX, 0(SP)
|
||||
|
||||
// !!! R14 = &src[len(src)]
|
||||
MOVQ src_len+32(FP), R14
|
||||
MOVQ R14, 8(SP)
|
||||
MOVQ R14, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
|
||||
ADDQ DX, R14
|
||||
|
||||
// !!! R13 = &src[len(src) - 8]
|
||||
MOVQ R14, R13
|
||||
SUBQ $8, R13
|
||||
|
||||
// !!! R15 = &src[candidate + 4]
|
||||
ADDQ $4, R15
|
||||
MOVQ R15, 24(SP)
|
||||
ADDQ DX, R15
|
||||
|
||||
// !!! s += 4
|
||||
ADDQ $4, SI
|
||||
SUBQ DX, SI
|
||||
MOVQ SI, 32(SP)
|
||||
|
||||
// Spill local variables (registers) onto the stack; call; unspill.
|
||||
inlineExtendMatchCmp8:
|
||||
// As long as we are 8 or more bytes before the end of src, we can load and
|
||||
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
|
||||
CMPQ SI, R13
|
||||
JA inlineExtendMatchCmp1
|
||||
MOVQ (R15), AX
|
||||
MOVQ (SI), BX
|
||||
CMPQ AX, BX
|
||||
JNE inlineExtendMatchBSF
|
||||
ADDQ $8, R15
|
||||
ADDQ $8, SI
|
||||
JMP inlineExtendMatchCmp8
|
||||
|
||||
inlineExtendMatchBSF:
|
||||
// If those 8 bytes were not equal, XOR the two 8 byte values, and return
|
||||
// the index of the first byte that differs. The BSF instruction finds the
|
||||
// least significant 1 bit, the amd64 architecture is little-endian, and
|
||||
// the shift by 3 converts a bit index to a byte index.
|
||||
XORQ AX, BX
|
||||
BSFQ BX, BX
|
||||
SHRQ $3, BX
|
||||
ADDQ BX, SI
|
||||
JMP inlineExtendMatchEnd
|
||||
|
||||
inlineExtendMatchCmp1:
|
||||
// In src's tail, compare 1 byte at a time.
|
||||
CMPQ SI, R14
|
||||
JAE inlineExtendMatchEnd
|
||||
MOVB (R15), AX
|
||||
MOVB (SI), BX
|
||||
CMPB AX, BX
|
||||
JNE inlineExtendMatchEnd
|
||||
ADDQ $1, R15
|
||||
ADDQ $1, SI
|
||||
JMP inlineExtendMatchCmp1
|
||||
|
||||
inlineExtendMatchEnd:
|
||||
// End inline of the extendMatch call.
|
||||
// ----------------------------------------
|
||||
|
||||
// ----------------------------------------
|
||||
// Begin inline of the emitCopy call.
|
||||
//
|
||||
// We don't need to unspill CX or R9 as we are just about to call another
|
||||
// function.
|
||||
MOVQ DI, 80(SP)
|
||||
MOVQ R11, 96(SP)
|
||||
MOVQ R12, 104(SP)
|
||||
CALL ·extendMatch(SB)
|
||||
MOVQ 64(SP), DX
|
||||
MOVQ 80(SP), DI
|
||||
MOVQ 96(SP), R11
|
||||
MOVQ 104(SP), R12
|
||||
|
||||
// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
|
||||
// register holds &src[s], not s.
|
||||
MOVQ 40(SP), SI
|
||||
ADDQ DX, SI
|
||||
|
||||
// d += emitCopy(dst[d:], base-candidate, s-base)
|
||||
//
|
||||
// Push args.
|
||||
MOVQ DI, 0(SP)
|
||||
MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
|
||||
MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
|
||||
MOVQ R11, 24(SP)
|
||||
|
||||
// !!! length := s - base
|
||||
MOVQ SI, AX
|
||||
SUBQ R12, AX
|
||||
MOVQ AX, 32(SP)
|
||||
|
||||
// Spill local variables (registers) onto the stack; call; unspill.
|
||||
MOVQ SI, 72(SP)
|
||||
MOVQ DI, 80(SP)
|
||||
CALL ·emitCopy(SB)
|
||||
MOVQ 56(SP), CX
|
||||
MOVQ 64(SP), DX
|
||||
MOVQ 72(SP), SI
|
||||
MOVQ 80(SP), DI
|
||||
MOVQ 88(SP), R9
|
||||
inlineEmitCopyLoop0:
|
||||
// for length >= 68 { etc }
|
||||
CMPL AX, $68
|
||||
JLT inlineEmitCopyStep1
|
||||
|
||||
// Finish the "d +=" part of "d += emitCopy(etc)".
|
||||
ADDQ 40(SP), DI
|
||||
// Emit a length 64 copy, encoded as 3 bytes.
|
||||
MOVB $0xfe, 0(DI)
|
||||
MOVW R11, 1(DI)
|
||||
ADDQ $3, DI
|
||||
SUBL $64, AX
|
||||
JMP inlineEmitCopyLoop0
|
||||
|
||||
inlineEmitCopyStep1:
|
||||
// if length > 64 { etc }
|
||||
CMPL AX, $64
|
||||
JLE inlineEmitCopyStep2
|
||||
|
||||
// Emit a length 60 copy, encoded as 3 bytes.
|
||||
MOVB $0xee, 0(DI)
|
||||
MOVW R11, 1(DI)
|
||||
ADDQ $3, DI
|
||||
SUBL $60, AX
|
||||
|
||||
inlineEmitCopyStep2:
|
||||
// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
|
||||
CMPL AX, $12
|
||||
JGE inlineEmitCopyStep3
|
||||
CMPL R11, $2048
|
||||
JGE inlineEmitCopyStep3
|
||||
|
||||
// Emit the remaining copy, encoded as 2 bytes.
|
||||
MOVB R11, 1(DI)
|
||||
SHRL $8, R11
|
||||
SHLB $5, R11
|
||||
SUBB $4, AX
|
||||
SHLB $2, AX
|
||||
ORB AX, R11
|
||||
ORB $1, R11
|
||||
MOVB R11, 0(DI)
|
||||
ADDQ $2, DI
|
||||
JMP inlineEmitCopyEnd
|
||||
|
||||
inlineEmitCopyStep3:
|
||||
// Emit the remaining copy, encoded as 3 bytes.
|
||||
SUBL $1, AX
|
||||
SHLB $2, AX
|
||||
ORB $2, AX
|
||||
MOVB AX, 0(DI)
|
||||
MOVW R11, 1(DI)
|
||||
ADDQ $3, DI
|
||||
|
||||
inlineEmitCopyEnd:
|
||||
// End inline of the emitCopy call.
|
||||
// ----------------------------------------
|
||||
|
||||
// nextEmit = s
|
||||
MOVQ SI, R10
|
||||
@@ -522,6 +642,7 @@ inner1:
|
||||
MOVQ SI, AX
|
||||
SUBQ DX, AX
|
||||
SUBQ $1, AX
|
||||
|
||||
// XXX: MOVW AX, table-32768(SP)(R11*2)
|
||||
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
|
||||
BYTE $0x66
|
||||
@@ -549,6 +670,7 @@ inner1:
|
||||
|
||||
// table[currHash] = uint16(s)
|
||||
ADDQ $1, AX
|
||||
|
||||
// XXX: MOVW AX, table-32768(SP)(R11*2)
|
||||
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
|
||||
BYTE $0x66
|
||||
|
||||
Reference in New Issue
Block a user