Update vendoring

2025-11-03 03:58:01 +00:00 · 2016-04-26 00:18:04 +00:00
parent c26838e6da
commit 97810148f3
253 changed files with 14960 additions and 4479 deletions
--- a/vendor/github.com/golang/snappy/decode.go
+++ b/vendor/github.com/golang/snappy/decode.go
@@ -108,9 +108,9 @@ func (r *Reader) Reset(reader io.Reader) {
 	r.readHeader = false
 }

-func (r *Reader) readFull(p []byte) (ok bool) {
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
 	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
-		if r.err == io.ErrUnexpectedEOF {
+		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
 			r.err = ErrCorrupt
 		}
 		return false
@@ -129,7 +129,7 @@ func (r *Reader) Read(p []byte) (int, error) {
 			r.i += n
 			return n, nil
 		}
-		if !r.readFull(r.buf[:4]) {
+		if !r.readFull(r.buf[:4], true) {
 			return 0, r.err
 		}
 		chunkType := r.buf[0]
@@ -156,7 +156,7 @@ func (r *Reader) Read(p []byte) (int, error) {
 				return 0, r.err
 			}
 			buf := r.buf[:chunkLen]
-			if !r.readFull(buf) {
+			if !r.readFull(buf, false) {
 				return 0, r.err
 			}
 			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
@@ -189,13 +189,17 @@ func (r *Reader) Read(p []byte) (int, error) {
 				return 0, r.err
 			}
 			buf := r.buf[:checksumSize]
-			if !r.readFull(buf) {
+			if !r.readFull(buf, false) {
 				return 0, r.err
 			}
 			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
 			// Read directly into r.decoded instead of via r.buf.
 			n := chunkLen - checksumSize
-			if !r.readFull(r.decoded[:n]) {
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.decoded[:n], false) {
 				return 0, r.err
 			}
 			if crc(r.decoded[:n]) != checksum {
@@ -211,7 +215,7 @@ func (r *Reader) Read(p []byte) (int, error) {
 				r.err = ErrCorrupt
 				return 0, r.err
 			}
-			if !r.readFull(r.buf[:len(magicBody)]) {
+			if !r.readFull(r.buf[:len(magicBody)], false) {
 				return 0, r.err
 			}
 			for i := 0; i < len(magicBody); i++ {
@@ -230,7 +234,7 @@ func (r *Reader) Read(p []byte) (int, error) {
 		}
 		// Section 4.4 Padding (chunk type 0xfe).
 		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
-		if !r.readFull(r.buf[:chunkLen]) {
+		if !r.readFull(r.buf[:chunkLen], false) {
 			return 0, r.err
 		}
 	}
--- a/vendor/github.com/golang/snappy/decode_amd64.go
+++ b/vendor/github.com/golang/snappy/decode_amd64.go
@@ -2,6 +2,10 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+// +build !appengine
+// +build gc
+// +build !noasm
+
 package snappy

 // decode has the same semantics as in decode_other.go.
--- a/vendor/github.com/golang/snappy/decode_amd64.s
+++ b/vendor/github.com/golang/snappy/decode_amd64.s
@@ -2,12 +2,16 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+// +build !appengine
+// +build gc
+// +build !noasm
+
 #include "textflag.h"

-// func decode(dst, src []byte) int
-//
 // The asm code generally follows the pure Go code in decode_other.go, except
 // where marked with a "!!!".
+
+// func decode(dst, src []byte) int
 //
 // All local variables fit into registers. The non-zero stack size is only to
 // spill registers and push args when issuing a CALL. The register allocation:
--- a/vendor/github.com/golang/snappy/decode_other.go
+++ b/vendor/github.com/golang/snappy/decode_other.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build !amd64
+// +build !amd64 appengine !gc noasm

 package snappy

--- a/vendor/github.com/golang/snappy/encode.go
+++ b/vendor/github.com/golang/snappy/encode.go
@@ -10,78 +10,11 @@ import (
 	"io"
 )

-// maxOffset limits how far copy back-references can go, the same as the C++
-// code.
-const maxOffset = 1 << 15
-
-// emitLiteral writes a literal chunk and returns the number of bytes written.
-func emitLiteral(dst, lit []byte) int {
-	i, n := 0, uint(len(lit)-1)
-	switch {
-	case n < 60:
-		dst[0] = uint8(n)<<2 | tagLiteral
-		i = 1
-	case n < 1<<8:
-		dst[0] = 60<<2 | tagLiteral
-		dst[1] = uint8(n)
-		i = 2
-	case n < 1<<16:
-		dst[0] = 61<<2 | tagLiteral
-		dst[1] = uint8(n)
-		dst[2] = uint8(n >> 8)
-		i = 3
-	case n < 1<<24:
-		dst[0] = 62<<2 | tagLiteral
-		dst[1] = uint8(n)
-		dst[2] = uint8(n >> 8)
-		dst[3] = uint8(n >> 16)
-		i = 4
-	case int64(n) < 1<<32:
-		dst[0] = 63<<2 | tagLiteral
-		dst[1] = uint8(n)
-		dst[2] = uint8(n >> 8)
-		dst[3] = uint8(n >> 16)
-		dst[4] = uint8(n >> 24)
-		i = 5
-	default:
-		panic("snappy: source buffer is too long")
-	}
-	if copy(dst[i:], lit) != len(lit) {
-		panic("snappy: destination buffer is too short")
-	}
-	return i + len(lit)
-}
-
-// emitCopy writes a copy chunk and returns the number of bytes written.
-func emitCopy(dst []byte, offset, length int32) int {
-	i := 0
-	for length > 0 {
-		x := length - 4
-		if 0 <= x && x < 1<<3 && offset < 1<<11 {
-			dst[i+0] = uint8(offset>>8)&0x07<<5 | uint8(x)<<2 | tagCopy1
-			dst[i+1] = uint8(offset)
-			i += 2
-			break
-		}
-
-		x = length
-		if x > 1<<6 {
-			x = 1 << 6
-		}
-		dst[i+0] = uint8(x-1)<<2 | tagCopy2
-		dst[i+1] = uint8(offset)
-		dst[i+2] = uint8(offset >> 8)
-		i += 3
-		length -= x
-	}
-	return i
-}
-
 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
 //
-// It is valid to pass a nil dst.
+// The dst and src must not overlap. It is valid to pass a nil dst.
 func Encode(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
@@ -98,94 +31,43 @@ func Encode(dst, src []byte) []byte {
 		if len(p) > maxBlockSize {
 			p, src = p[:maxBlockSize], p[maxBlockSize:]
 		}
-		d += encodeBlock(dst[d:], p)
+		if len(p) < minNonLiteralBlockSize {
+			d += emitLiteral(dst[d:], p)
+		} else {
+			d += encodeBlock(dst[d:], p)
+		}
 	}
 	return dst[:d]
 }

-// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
-// assumes that the varint-encoded length of the decompressed bytes has already
-// been written.
+// inputMargin is the minimum number of extra input bytes to keep, inside
+// encodeBlock's inner loop. On some architectures, this margin lets us
+// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
+// literals can be implemented as a single load to and store from a 16-byte
+// register. That literal's actual length can be as short as 1 byte, so this
+// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
+// the encoding loop will fix up the copy overrun, and this inputMargin ensures
+// that we don't overrun the dst and src buffers.
+const inputMargin = 16 - 1
+
+// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
+// could be encoded with a copy tag. This is the minimum with respect to the
+// algorithm used by encodeBlock, not a minimum enforced by the file format.
 //
-// It also assumes that:
-//	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	0 < len(src) && len(src) <= maxBlockSize
-func encodeBlock(dst, src []byte) (d int) {
-	// Return early if src is short.
-	if len(src) <= 4 {
-		return emitLiteral(dst, src)
-	}
-
-	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
-	const maxTableSize = 1 << 14
-	shift, tableSize := uint(32-8), 1<<8
-	for tableSize < maxTableSize && tableSize < len(src) {
-		shift--
-		tableSize *= 2
-	}
-	var table [maxTableSize]int32
-
-	// Iterate over the source bytes.
-	var (
-		s   int32 // The iterator position.
-		t   int32 // The last position with the same hash as s.
-		lit int32 // The start position of any pending literal bytes.
-
-		// Copied from the C++ snappy implementation:
-		//
-		// Heuristic match skipping: If 32 bytes are scanned with no matches
-		// found, start looking only at every other byte. If 32 more bytes are
-		// scanned, look at every third byte, etc.. When a match is found,
-		// immediately go back to looking at every byte. This is a small loss
-		// (~5% performance, ~0.1% density) for compressible data due to more
-		// bookkeeping, but for non-compressible data (such as JPEG) it's a
-		// huge win since the compressor quickly "realizes" the data is
-		// incompressible and doesn't bother looking for matches everywhere.
-		//
-		// The "skip" variable keeps track of how many bytes there are since
-		// the last match; dividing it by 32 (ie. right-shifting by five) gives
-		// the number of bytes to move ahead for each iteration.
-		skip uint32 = 32
-	)
-	for uint32(s+3) < uint32(len(src)) { // The uint32 conversions catch overflow from the +3.
-		// Update the hash table.
-		b0, b1, b2, b3 := src[s], src[s+1], src[s+2], src[s+3]
-		h := uint32(b0) | uint32(b1)<<8 | uint32(b2)<<16 | uint32(b3)<<24
-		p := &table[(h*0x1e35a7bd)>>shift]
-		// We need to to store values in [-1, inf) in table. To save
-		// some initialization time, (re)use the table's zero value
-		// and shift the values against this zero: add 1 on writes,
-		// subtract 1 on reads.
-		t, *p = *p-1, s+1
-		// If t is invalid or src[s:s+4] differs from src[t:t+4], accumulate a literal byte.
-		if t < 0 || s-t >= maxOffset || b0 != src[t] || b1 != src[t+1] || b2 != src[t+2] || b3 != src[t+3] {
-			s += int32(skip >> 5)
-			skip++
-			continue
-		}
-		skip = 32
-		// Otherwise, we have a match. First, emit any pending literal bytes.
-		if lit != s {
-			d += emitLiteral(dst[d:], src[lit:s])
-		}
-		// Extend the match to be as long as possible.
-		s0 := s
-		s, t = s+4, t+4
-		for int(s) < len(src) && src[s] == src[t] {
-			s++
-			t++
-		}
-		// Emit the copied bytes.
-		d += emitCopy(dst[d:], s-t, s-s0)
-		lit = s
-	}
-
-	// Emit any final pending literal bytes and return.
-	if int(lit) != len(src) {
-		d += emitLiteral(dst[d:], src[lit:])
-	}
-	return d
-}
+// The encoded output must start with at least a 1 byte literal, as there are
+// no previous bytes to copy. A minimal (1 byte) copy after that, generated
+// from an emitCopy call in encodeBlock's main loop, would require at least
+// another inputMargin bytes, for the reason above: we want any emitLiteral
+// calls inside encodeBlock's main loop to use the fast path if possible, which
+// requires being able to overrun by inputMargin bytes. Thus,
+// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
+//
+// The C++ code doesn't use this exact threshold, but it could, as discussed at
+// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
+// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
+// optimization. It should not affect the encoded form. This is tested by
+// TestSameEncodingAsCppShortCopies.
+const minNonLiteralBlockSize = 1 + 1 + inputMargin

 // MaxEncodedLen returns the maximum length of a snappy block, given its
 // uncompressed length.
--- a/vendor/github.com/golang/snappy/encode_amd64.go
+++ b/vendor/github.com/golang/snappy/encode_amd64.go
@@ -0,0 +1,29 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package snappy
+
+// emitLiteral has the same semantics as in encode_other.go.
+//
+//go:noescape
+func emitLiteral(dst, lit []byte) int
+
+// emitCopy has the same semantics as in encode_other.go.
+//
+//go:noescape
+func emitCopy(dst []byte, offset, length int) int
+
+// extendMatch has the same semantics as in encode_other.go.
+//
+//go:noescape
+func extendMatch(src []byte, i, j int) int
+
+// encodeBlock has the same semantics as in encode_other.go.
+//
+//go:noescape
+func encodeBlock(dst, src []byte) (d int)
--- a/vendor/github.com/golang/snappy/encode_amd64.s
+++ b/vendor/github.com/golang/snappy/encode_amd64.s
@@ -0,0 +1,608 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// TODO: figure out why the XXX lines compile with Go 1.4 and Go tip but not
+// Go 1.6.
+//
+// This is https://github.com/golang/snappy/issues/29
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	return value
+//	- BX	n
+//	- CX	len(lit)
+//	- SI	&lit[0]
+//	- DI	&dst[i]
+//
+// The 24 bytes of stack space is to call runtime·memmove.
+TEXT ·emitLiteral(SB), NOSPLIT, $24-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ lit_base+24(FP), SI
+	MOVQ lit_len+32(FP), CX
+	MOVQ CX, AX
+	MOVL CX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  oneByte
+	CMPL BX, $256
+	JLT  twoBytes
+
+threeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	ADDQ $3, AX
+	JMP  emitLiteralEnd
+
+twoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	ADDQ $2, AX
+	JMP  emitLiteralEnd
+
+oneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+	ADDQ $1, AX
+
+emitLiteralEnd:
+	MOVQ AX, ret+48(FP)
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, SI and CX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- BX	offset
+//	- CX	length
+//	- SI	&dst[0]
+//	- DI	&dst[i]
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, SI
+	MOVQ offset+24(FP), BX
+	MOVQ length+32(FP), CX
+
+loop0:
+	// for length >= 68 { etc }
+	CMPL CX, $68
+	JLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, CX
+	JMP  loop0
+
+step1:
+	// if length > 64 { etc }
+	CMPL CX, $64
+	JLE  step2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, CX
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMPL CX, $12
+	JGE  step3
+	CMPL BX, $2048
+	JGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB BX, 1(DI)
+	SHRL $8, BX
+	SHLB $5, BX
+	SUBB $4, CX
+	SHLB $2, CX
+	ORB  CX, BX
+	ORB  $1, BX
+	MOVB BX, 0(DI)
+	ADDQ $2, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, CX
+	SHLB $2, CX
+	ORB  $2, CX
+	MOVB CX, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- CX	&src[0]
+//	- DX	&src[len(src)]
+//	- SI	&src[i]
+//	- DI	&src[j]
+//	- R9	&src[len(src) - 8]
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+	MOVQ src_base+0(FP), CX
+	MOVQ src_len+8(FP), DX
+	MOVQ i+24(FP), SI
+	MOVQ j+32(FP), DI
+	ADDQ CX, DX
+	ADDQ CX, SI
+	ADDQ CX, DI
+	MOVQ DX, R9
+	SUBQ $8, R9
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ DI, R9
+	JA   cmp1
+	MOVQ (SI), AX
+	MOVQ (DI), BX
+	CMPQ AX, BX
+	JNE  bsf
+	ADDQ $8, SI
+	ADDQ $8, DI
+	JMP  cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, DI
+
+	// Convert from &src[ret] to ret.
+	SUBQ CX, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+cmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ DI, DX
+	JAE  extendMatchEnd
+	MOVB (SI), AX
+	MOVB (DI), BX
+	CMPB AX, BX
+	JNE  extendMatchEnd
+	ADDQ $1, SI
+	ADDQ $1, DI
+	JMP  cmp1
+
+extendMatchEnd:
+	// Convert from &src[ret] to ret.
+	SUBQ CX, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- AX	.	.
+//	- BX	.	.
+//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
+//	- DX	64	&src[0], tableSize
+//	- SI	72	&src[s]
+//	- DI	80	&dst[d]
+//	- R9	88	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	96	prevHash, currHash, nextHash, offset
+//	- R12	104	&src[base], skip
+//	- R13	.	&src[nextS]
+//	- R14	.	len(src), bytesBetweenHashLookups, x
+//	- R15	112	candidate
+//
+// The second column (56, 64, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
+TEXT ·encodeBlock(SB), 0, $32888-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVQ $24, CX
+	MOVQ $256, DX
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	CMPQ DX, $16384
+	JGE  varTable
+	CMPQ DX, R14
+	JGE  varTable
+	SUBQ $1, CX
+	SHLQ $1, DX
+	JMP  calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// In the asm code, unlike the Go code, we can zero-initialize only the
+	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
+	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
+	// 2048 writes that would zero-initialize all of table's 32768 bytes.
+	SHRQ $3, DX
+	LEAQ table-32768(SP), BX
+	PXOR X0, X0
+
+memclr:
+	MOVOU X0, 0(BX)
+	ADDQ  $16, BX
+	SUBQ  $1, DX
+	JNZ   memclr
+
+	// !!! DX = &src[0]
+	MOVQ SI, DX
+
+	// sLimit := len(src) - inputMargin
+	MOVQ R14, R9
+	SUBQ $15, R9
+
+	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVQ CX, 56(SP)
+	MOVQ DX, 64(SP)
+	MOVQ R9, 88(SP)
+
+	// nextEmit := 0
+	MOVQ DX, R10
+
+	// s := 1
+	ADDQ $1, SI
+
+	// nextHash := hash(load32(src, s), shift)
+	MOVL  0(SI), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVQ $32, R12
+
+	// nextS := s
+	MOVQ SI, R13
+
+	// candidate := 0
+	MOVQ $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVQ R13, SI
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVQ R12, R14
+	SHRQ $5, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADDQ R14, R13
+
+	// skip += bytesBetweenHashLookups
+	ADDQ R14, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	MOVQ R13, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JA   emitRemainder
+
+	// candidate = int(table[nextHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[nextHash] = uint16(s)
+	MOVQ SI, AX
+	SUBQ DX, AX
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVL  0(R13), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVL 0(SI), AX
+	MOVL (DX)(R15*1), BX
+	CMPL AX, BX
+	JNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVQ SI, AX
+	SUBQ R10, AX
+	CMPQ AX, $16
+	JLE  emitLiteralFastPath
+
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R10, 24(SP)
+	MOVQ AX, 32(SP)
+	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ SI, 72(SP)
+	MOVQ DI, 80(SP)
+	MOVQ R15, 112(SP)
+	CALL ·emitLiteral(SB)
+	MOVQ 56(SP), CX
+	MOVQ 64(SP), DX
+	MOVQ 72(SP), SI
+	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
+	MOVQ 112(SP), R15
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ 48(SP), DI
+	JMP  inner1
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB AX, BX
+	SUBB $1, BX
+	SHLB $2, BX
+	MOVB BX, (DI)
+	ADDQ $1, DI
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(R10), X0
+	MOVOU X0, 0(DI)
+	ADDQ  AX, DI
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVQ SI, R12
+
+	// !!! offset := base - candidate
+	MOVQ R12, R11
+	SUBQ R15, R11
+	SUBQ DX, R11
+
+	// s = extendMatch(src, candidate+4, s+4)
+	//
+	// Push args.
+	MOVQ DX, 0(SP)
+	MOVQ src_len+32(FP), R14
+	MOVQ R14, 8(SP)
+	MOVQ R14, 16(SP)         // Unnecessary, as the callee ignores it, but conservative.
+	ADDQ $4, R15
+	MOVQ R15, 24(SP)
+	ADDQ $4, SI
+	SUBQ DX, SI
+	MOVQ SI, 32(SP)
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// We don't need to unspill CX or R9 as we are just about to call another
+	// function.
+	MOVQ DI, 80(SP)
+	MOVQ R11, 96(SP)
+	MOVQ R12, 104(SP)
+	CALL ·extendMatch(SB)
+	MOVQ 64(SP), DX
+	MOVQ 80(SP), DI
+	MOVQ 96(SP), R11
+	MOVQ 104(SP), R12
+
+	// Finish the "s =" part of "s = extendMatch(etc)", remembering that the SI
+	// register holds &src[s], not s.
+	MOVQ 40(SP), SI
+	ADDQ DX, SI
+
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R11, 24(SP)
+	MOVQ SI, AX
+	SUBQ R12, AX
+	MOVQ AX, 32(SP)
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ SI, 72(SP)
+	MOVQ DI, 80(SP)
+	CALL ·emitCopy(SB)
+	MOVQ 56(SP), CX
+	MOVQ 64(SP), DX
+	MOVQ 72(SP), SI
+	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
+
+	// Finish the "d +=" part of "d += emitCopy(etc)".
+	ADDQ 40(SP), DI
+
+	// nextEmit = s
+	MOVQ SI, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVQ SI, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JAE  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVQ -1(SI), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVQ SI, AX
+	SUBQ DX, AX
+	SUBQ $1, AX
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// currHash := hash(uint32(x>>8), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// candidate = int(table[currHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[currHash] = uint16(s)
+	ADDQ $1, AX
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVL (DX)(R15*1), BX
+	CMPL R14, BX
+	JEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// s++
+	ADDQ $1, SI
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	JMP outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVQ src_len+32(FP), AX
+	ADDQ DX, AX
+	CMPQ R10, AX
+	JEQ  encodeBlockEnd
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R10, 24(SP)
+	SUBQ R10, AX
+	MOVQ AX, 32(SP)
+	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ DI, 80(SP)
+	CALL ·emitLiteral(SB)
+	MOVQ 80(SP), DI
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ 48(SP), DI
+
+encodeBlockEnd:
+	MOVQ dst_base+0(FP), AX
+	SUBQ AX, DI
+	MOVQ DI, d+48(FP)
+	RET
--- a/vendor/github.com/golang/snappy/encode_other.go
+++ b/vendor/github.com/golang/snappy/encode_other.go
@@ -0,0 +1,238 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package snappy
+
+func load32(b []byte, i int) uint32 {
+	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+// emitLiteral writes a literal chunk and returns the number of bytes written.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= len(lit) && len(lit) <= 65536
+func emitLiteral(dst, lit []byte) int {
+	i, n := 0, uint(len(lit)-1)
+	switch {
+	case n < 60:
+		dst[0] = uint8(n)<<2 | tagLiteral
+		i = 1
+	case n < 1<<8:
+		dst[0] = 60<<2 | tagLiteral
+		dst[1] = uint8(n)
+		i = 2
+	default:
+		dst[0] = 61<<2 | tagLiteral
+		dst[1] = uint8(n)
+		dst[2] = uint8(n >> 8)
+		i = 3
+	}
+	return i + copy(dst[i:], lit)
+}
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= 65535
+//	4 <= length && length <= 65535
+func emitCopy(dst []byte, offset, length int) int {
+	i := 0
+	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
+	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
+	// length emitted down below is is a little lower (at 60 = 64 - 4), because
+	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
+	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
+	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
+	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
+	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
+	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
+	for length >= 68 {
+		// Emit a length 64 copy, encoded as 3 bytes.
+		dst[i+0] = 63<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 64
+	}
+	if length > 64 {
+		// Emit a length 60 copy, encoded as 3 bytes.
+		dst[i+0] = 59<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 60
+	}
+	if length >= 12 || offset >= 2048 {
+		// Emit the remaining copy, encoded as 3 bytes.
+		dst[i+0] = uint8(length-1)<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		return i + 3
+	}
+	// Emit the remaining copy, encoded as 2 bytes.
+	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
+	dst[i+1] = uint8(offset)
+	return i + 2
+}
+
+// extendMatch returns the largest k such that k <= len(src) and that
+// src[i:i+k-j] and src[j:k] have the same contents.
+//
+// It assumes that:
+//	0 <= i && i < j && j <= len(src)
+func extendMatch(src []byte, i, j int) int {
+	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
+	}
+	return j
+}
+
+func hash(u, shift uint32) uint32 {
+	return (u * 0x1e35a7bd) >> shift
+}
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
+	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but helps the compiler eliminate bounds
+		// checks.
+		tableMask = maxTableSize - 1
+	)
+	shift := uint32(32 - 8)
+	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+		shift--
+	}
+	// In Go, all array elements are zero-initialized, so there is no advantage
+	// to a smaller tableSize per se. However, it matches the C++ algorithm,
+	// and in the asm versions of this code, we can get away with zeroing only
+	// the first tableSize elements.
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s), shift)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc.. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (ie. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS), shift)
+			if load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+
+			// Extend the 4-byte match as long as possible.
+			//
+			// This is an inlined version of:
+			//	s = extendMatch(src, candidate+4, s+4)
+			s += 4
+			for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
+			}
+
+			d += emitCopy(dst[d:], base-candidate, s-base)
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x>>0), shift)
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x>>8), shift)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if uint32(x>>8) != load32(src, candidate) {
+				nextHash = hash(uint32(x>>16), shift)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}