Bump deps.

2025-11-03 03:58:01 +00:00 · 2016-05-02 20:11:05 -04:00
parent 5f47acad1c
commit e2091c34e7
30 changed files with 1206 additions and 441 deletions
--- a/vendor/github.com/golang/snappy/README
+++ b/vendor/github.com/golang/snappy/README
@@ -5,3 +5,103 @@ $ go get github.com/golang/snappy

 Unless otherwise noted, the Snappy-Go source files are distributed
 under the BSD-style license found in the LICENSE file.
+
+
+
+Benchmarks.
+
+The golang/snappy benchmarks include compressing (Z) and decompressing (U) the
+twelve files listed below, the same set used by the C++ Snappy code
+(github.com/google/snappy; note the "google", not "golang"). On an "Intel(R)
+Core(TM) i7-3770 CPU @ 3.40GHz", Go's GOARCH=amd64 numbers as of 2016-04-29:
+
+"go test -test.bench=."
+
+_UFlat0-8         2.23GB/s ± 1%  html
+_UFlat1-8         1.43GB/s ± 0%  urls
+_UFlat2-8         23.7GB/s ± 1%  jpg
+_UFlat3-8         1.93GB/s ± 0%  jpg_200
+_UFlat4-8         13.9GB/s ± 2%  pdf
+_UFlat5-8         2.00GB/s ± 0%  html4
+_UFlat6-8          829MB/s ± 0%  txt1
+_UFlat7-8          799MB/s ± 0%  txt2
+_UFlat8-8          871MB/s ± 0%  txt3
+_UFlat9-8          730MB/s ± 0%  txt4
+_UFlat10-8        2.87GB/s ± 0%  pb
+_UFlat11-8        1.07GB/s ± 0%  gaviota
+
+_ZFlat0-8         1.04GB/s ± 0%  html
+_ZFlat1-8          536MB/s ± 0%  urls
+_ZFlat2-8         16.3GB/s ± 2%  jpg
+_ZFlat3-8          762MB/s ± 0%  jpg_200
+_ZFlat4-8         9.48GB/s ± 1%  pdf
+_ZFlat5-8          990MB/s ± 0%  html4
+_ZFlat6-8          381MB/s ± 0%  txt1
+_ZFlat7-8          353MB/s ± 0%  txt2
+_ZFlat8-8          398MB/s ± 0%  txt3
+_ZFlat9-8          329MB/s ± 0%  txt4
+_ZFlat10-8        1.35GB/s ± 1%  pb
+_ZFlat11-8         608MB/s ± 0%  gaviota
+
+
+
+"go test -test.bench=. -tags=noasm"
+
+_UFlat0-8          637MB/s ± 0%  html
+_UFlat1-8          506MB/s ± 0%  urls
+_UFlat2-8         23.0GB/s ± 5%  jpg
+_UFlat3-8         1.17GB/s ± 0%  jpg_200
+_UFlat4-8         4.44GB/s ± 1%  pdf
+_UFlat5-8          623MB/s ± 0%  html4
+_UFlat6-8          300MB/s ± 1%  txt1
+_UFlat7-8          293MB/s ± 0%  txt2
+_UFlat8-8          316MB/s ± 0%  txt3
+_UFlat9-8          285MB/s ± 0%  txt4
+_UFlat10-8         768MB/s ± 0%  pb
+_UFlat11-8         406MB/s ± 1%  gaviota
+
+_ZFlat0-8          411MB/s ± 1%  html
+_ZFlat1-8          250MB/s ± 1%  urls
+_ZFlat2-8         12.7GB/s ± 1%  jpg
+_ZFlat3-8          157MB/s ± 0%  jpg_200
+_ZFlat4-8         2.95GB/s ± 0%  pdf
+_ZFlat5-8          406MB/s ± 0%  html4
+_ZFlat6-8          182MB/s ± 0%  txt1
+_ZFlat7-8          173MB/s ± 1%  txt2
+_ZFlat8-8          191MB/s ± 0%  txt3
+_ZFlat9-8          166MB/s ± 0%  txt4
+_ZFlat10-8         480MB/s ± 0%  pb
+_ZFlat11-8         272MB/s ± 0%  gaviota
+
+
+
+For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
+are the numbers reported by C++ Snappy when running:
+
+make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
+
+BM_UFlat/0     2.4GB/s  html
+BM_UFlat/1     1.4GB/s  urls
+BM_UFlat/2    21.8GB/s  jpg
+BM_UFlat/3     1.5GB/s  jpg_200
+BM_UFlat/4    13.3GB/s  pdf
+BM_UFlat/5     2.1GB/s  html4
+BM_UFlat/6     1.0GB/s  txt1
+BM_UFlat/7   959.4MB/s  txt2
+BM_UFlat/8     1.0GB/s  txt3
+BM_UFlat/9   864.5MB/s  txt4
+BM_UFlat/10    2.9GB/s  pb
+BM_UFlat/11    1.2GB/s  gaviota
+
+BM_ZFlat/0   944.3MB/s  html (22.31 %)
+BM_ZFlat/1   501.6MB/s  urls (47.78 %)
+BM_ZFlat/2    14.3GB/s  jpg (99.95 %)
+BM_ZFlat/3   538.3MB/s  jpg_200 (73.00 %)
+BM_ZFlat/4     8.3GB/s  pdf (83.30 %)
+BM_ZFlat/5   903.5MB/s  html4 (22.52 %)
+BM_ZFlat/6   336.0MB/s  txt1 (57.88 %)
+BM_ZFlat/7   312.3MB/s  txt2 (61.91 %)
+BM_ZFlat/8   353.1MB/s  txt3 (54.99 %)
+BM_ZFlat/9   289.9MB/s  txt4 (66.26 %)
+BM_ZFlat/10    1.2GB/s  pb (19.68 %)
+BM_ZFlat/11  527.4MB/s  gaviota (37.72 %)
--- a/vendor/github.com/golang/snappy/encode_amd64.s
+++ b/vendor/github.com/golang/snappy/encode_amd64.s
@@ -8,10 +8,17 @@

 #include "textflag.h"

-// TODO: figure out why the XXX lines compile with Go 1.4 and Go tip but not
-// Go 1.6.
+// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
+// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
+// https://github.com/golang/snappy/issues/29
 //
-// This is https://github.com/golang/snappy/issues/29
+// As a workaround, the package was built with a known good assembler, and
+// those instructions were disassembled by "objdump -d" to yield the
+//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+// style comments, in AT&T asm syntax. Note that rsp here is a physical
+// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
+// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
+// fine on Go 1.6.

 // The asm code generally follows the pure Go code in encode_other.go, except
 // where marked with a "!!!".
@@ -21,19 +28,23 @@
 // func emitLiteral(dst, lit []byte) int
 //
 // All local variables fit into registers. The register allocation:
-//	- AX	return value
+//	- AX	len(lit)
 //	- BX	n
-//	- CX	len(lit)
-//	- SI	&lit[0]
+//	- DX	return value
 //	- DI	&dst[i]
+//	- R10	&lit[0]
 //
 // The 24 bytes of stack space is to call runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
 TEXT ·emitLiteral(SB), NOSPLIT, $24-56
 	MOVQ dst_base+0(FP), DI
-	MOVQ lit_base+24(FP), SI
-	MOVQ lit_len+32(FP), CX
-	MOVQ CX, AX
-	MOVL CX, BX
+	MOVQ lit_base+24(FP), R10
+	MOVQ lit_len+32(FP), AX
+	MOVQ AX, DX
+	MOVL AX, BX
 	SUBL $1, BX

 	CMPL BX, $60
@@ -45,32 +56,32 @@ threeBytes:
 	MOVB $0xf4, 0(DI)
 	MOVW BX, 1(DI)
 	ADDQ $3, DI
-	ADDQ $3, AX
-	JMP  emitLiteralEnd
+	ADDQ $3, DX
+	JMP  memmove

 twoBytes:
 	MOVB $0xf0, 0(DI)
 	MOVB BX, 1(DI)
 	ADDQ $2, DI
-	ADDQ $2, AX
-	JMP  emitLiteralEnd
+	ADDQ $2, DX
+	JMP  memmove

 oneByte:
 	SHLB $2, BX
 	MOVB BX, 0(DI)
 	ADDQ $1, DI
-	ADDQ $1, AX
+	ADDQ $1, DX

-emitLiteralEnd:
-	MOVQ AX, ret+48(FP)
+memmove:
+	MOVQ DX, ret+48(FP)

 	// copy(dst[i:], lit)
 	//
 	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
-	// DI, SI and CX as arguments.
+	// DI, R10 and AX as arguments.
 	MOVQ DI, 0(SP)
-	MOVQ SI, 8(SP)
-	MOVQ CX, 16(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
 	CALL runtime·memmove(SB)
 	RET

@@ -79,55 +90,59 @@ emitLiteralEnd:
 // func emitCopy(dst []byte, offset, length int) int
 //
 // All local variables fit into registers. The register allocation:
-//	- BX	offset
-//	- CX	length
+//	- AX	length
 //	- SI	&dst[0]
 //	- DI	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
 TEXT ·emitCopy(SB), NOSPLIT, $0-48
 	MOVQ dst_base+0(FP), DI
 	MOVQ DI, SI
-	MOVQ offset+24(FP), BX
-	MOVQ length+32(FP), CX
+	MOVQ offset+24(FP), R11
+	MOVQ length+32(FP), AX

 loop0:
 	// for length >= 68 { etc }
-	CMPL CX, $68
+	CMPL AX, $68
 	JLT  step1

 	// Emit a length 64 copy, encoded as 3 bytes.
 	MOVB $0xfe, 0(DI)
-	MOVW BX, 1(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI
-	SUBL $64, CX
+	SUBL $64, AX
 	JMP  loop0

 step1:
 	// if length > 64 { etc }
-	CMPL CX, $64
+	CMPL AX, $64
 	JLE  step2

 	// Emit a length 60 copy, encoded as 3 bytes.
 	MOVB $0xee, 0(DI)
-	MOVW BX, 1(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI
-	SUBL $60, CX
+	SUBL $60, AX

 step2:
 	// if length >= 12 || offset >= 2048 { goto step3 }
-	CMPL CX, $12
+	CMPL AX, $12
 	JGE  step3
-	CMPL BX, $2048
+	CMPL R11, $2048
 	JGE  step3

 	// Emit the remaining copy, encoded as 2 bytes.
-	MOVB BX, 1(DI)
-	SHRL $8, BX
-	SHLB $5, BX
-	SUBB $4, CX
-	SHLB $2, CX
-	ORB  CX, BX
-	ORB  $1, BX
-	MOVB BX, 0(DI)
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
 	ADDQ $2, DI

 	// Return the number of bytes written.
@@ -137,11 +152,11 @@ step2:

 step3:
 	// Emit the remaining copy, encoded as 3 bytes.
-	SUBL $1, CX
-	SHLB $2, CX
-	ORB  $2, CX
-	MOVB CX, 0(DI)
-	MOVW BX, 1(DI)
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
 	ADDQ $3, DI

 	// Return the number of bytes written.
@@ -154,33 +169,37 @@ step3:
 // func extendMatch(src []byte, i, j int) int
 //
 // All local variables fit into registers. The register allocation:
-//	- CX	&src[0]
-//	- DX	&src[len(src)]
-//	- SI	&src[i]
-//	- DI	&src[j]
-//	- R9	&src[len(src) - 8]
+//	- DX	&src[0]
+//	- SI	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
 TEXT ·extendMatch(SB), NOSPLIT, $0-48
-	MOVQ src_base+0(FP), CX
-	MOVQ src_len+8(FP), DX
-	MOVQ i+24(FP), SI
-	MOVQ j+32(FP), DI
-	ADDQ CX, DX
-	ADDQ CX, SI
-	ADDQ CX, DI
-	MOVQ DX, R9
-	SUBQ $8, R9
+	MOVQ src_base+0(FP), DX
+	MOVQ src_len+8(FP), R14
+	MOVQ i+24(FP), R15
+	MOVQ j+32(FP), SI
+	ADDQ DX, R14
+	ADDQ DX, R15
+	ADDQ DX, SI
+	MOVQ R14, R13
+	SUBQ $8, R13

 cmp8:
 	// As long as we are 8 or more bytes before the end of src, we can load and
 	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
-	CMPQ DI, R9
+	CMPQ SI, R13
 	JA   cmp1
-	MOVQ (SI), AX
-	MOVQ (DI), BX
+	MOVQ (R15), AX
+	MOVQ (SI), BX
 	CMPQ AX, BX
 	JNE  bsf
+	ADDQ $8, R15
 	ADDQ $8, SI
-	ADDQ $8, DI
 	JMP  cmp8

 bsf:
@@ -191,29 +210,29 @@ bsf:
 	XORQ AX, BX
 	BSFQ BX, BX
 	SHRQ $3, BX
-	ADDQ BX, DI
+	ADDQ BX, SI

 	// Convert from &src[ret] to ret.
-	SUBQ CX, DI
-	MOVQ DI, ret+40(FP)
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
 	RET

 cmp1:
 	// In src's tail, compare 1 byte at a time.
-	CMPQ DI, DX
+	CMPQ SI, R14
 	JAE  extendMatchEnd
-	MOVB (SI), AX
-	MOVB (DI), BX
+	MOVB (R15), AX
+	MOVB (SI), BX
 	CMPB AX, BX
 	JNE  extendMatchEnd
+	ADDQ $1, R15
 	ADDQ $1, SI
-	ADDQ $1, DI
 	JMP  cmp1

 extendMatchEnd:
 	// Convert from &src[ret] to ret.
-	SUBQ CX, DI
-	MOVQ DI, ret+40(FP)
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
 	RET

 // ----------------------------------------------------------------------------
@@ -232,8 +251,8 @@ extendMatchEnd:
 //	- R10	.	&src[nextEmit]
 //	- R11	96	prevHash, currHash, nextHash, offset
 //	- R12	104	&src[base], skip
-//	- R13	.	&src[nextS]
-//	- R14	.	len(src), bytesBetweenHashLookups, x
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
 //	- R15	112	candidate
 //
 // The second column (56, 64, etc) is the stack offset to spill the registers
@@ -352,6 +371,7 @@ inner0:
 	// table[nextHash] = uint16(s)
 	MOVQ SI, AX
 	SUBQ DX, AX
+
 	// XXX: MOVW AX, table-32768(SP)(R11*2)
 	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
 	BYTE $0x66
@@ -384,32 +404,63 @@ fourByteMatch:
 	CMPQ AX, $16
 	JLE  emitLiteralFastPath

-	// d += emitLiteral(dst[d:], src[nextEmit:s])
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
 	//
-	// Push args.
-	MOVQ DI, 0(SP)
-	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ R10, 24(SP)
-	MOVQ AX, 32(SP)
-	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	// d += emitLiteral(dst[d:], src[nextEmit:s])

+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  inlineEmitLiteralOneByte
+	CMPL BX, $256
+	JLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+
+inlineEmitLiteralMemmove:
 	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
 	MOVQ SI, 72(SP)
 	MOVQ DI, 80(SP)
 	MOVQ R15, 112(SP)
-	CALL ·emitLiteral(SB)
+	CALL runtime·memmove(SB)
 	MOVQ 56(SP), CX
 	MOVQ 64(SP), DX
 	MOVQ 72(SP), SI
 	MOVQ 80(SP), DI
 	MOVQ 88(SP), R9
 	MOVQ 112(SP), R15
-
-	// Finish the "d +=" part of "d += emitLiteral(etc)".
-	ADDQ 48(SP), DI
 	JMP  inner1

+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
 emitLiteralFastPath:
 	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
 	MOVB AX, BX
@@ -442,60 +493,129 @@ inner1:
 	SUBQ R15, R11
 	SUBQ DX, R11

+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
+	//
 	// s = extendMatch(src, candidate+4, s+4)
-	//
-	// Push args.
-	MOVQ DX, 0(SP)
+
+	// !!! R14 = &src[len(src)]
 	MOVQ src_len+32(FP), R14
-	MOVQ R14, 8(SP)
-	MOVQ R14, 16(SP)         // Unnecessary, as the callee ignores it, but conservative.
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
 	ADDQ $4, R15
-	MOVQ R15, 24(SP)
+	ADDQ DX, R15
+
+	// !!! s += 4
 	ADDQ $4, SI
-	SUBQ DX, SI
-	MOVQ SI, 32(SP)

-	// Spill local variables (registers) onto the stack; call; unspill.
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
+
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
 	//
-	// We don't need to unspill CX or R9 as we are just about to call another
-	// function.
-	MOVQ DI, 80(SP)
-	MOVQ R11, 96(SP)
-	MOVQ R12, 104(SP)
-	CALL ·extendMatch(SB)
-	MOVQ 64(SP), DX
-	MOVQ 80(SP), DI
-	MOVQ 96(SP), R11
-	MOVQ 104(SP), R12
-
-	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
-	// register holds &src[s], not s.
-	MOVQ 40(SP), SI
-	ADDQ DX, SI
-
 	// d += emitCopy(dst[d:], base-candidate, s-base)
-	//
-	// Push args.
-	MOVQ DI, 0(SP)
-	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
-	MOVQ R11, 24(SP)
+
+	// !!! length := s - base
 	MOVQ SI, AX
 	SUBQ R12, AX
-	MOVQ AX, 32(SP)

-	// Spill local variables (registers) onto the stack; call; unspill.
-	MOVQ SI, 72(SP)
-	MOVQ DI, 80(SP)
-	CALL ·emitCopy(SB)
-	MOVQ 56(SP), CX
-	MOVQ 64(SP), DX
-	MOVQ 72(SP), SI
-	MOVQ 80(SP), DI
-	MOVQ 88(SP), R9
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  inlineEmitCopyStep1

-	// Finish the "d +=" part of "d += emitCopy(etc)".
-	ADDQ 40(SP), DI
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	CMPL AX, $12
+	JGE  inlineEmitCopyStep3
+	CMPL R11, $2048
+	JGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------

 	// nextEmit = s
 	MOVQ SI, R10
@@ -522,6 +642,7 @@ inner1:
 	MOVQ SI, AX
 	SUBQ DX, AX
 	SUBQ $1, AX
+
 	// XXX: MOVW AX, table-32768(SP)(R11*2)
 	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
 	BYTE $0x66
@@ -549,6 +670,7 @@ inner1:

 	// table[currHash] = uint16(s)
 	ADDQ $1, AX
+
 	// XXX: MOVW AX, table-32768(SP)(R11*2)
 	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
 	BYTE $0x66