mirror of https://github.com/grafana/grafana
Modules: Add goavro dependency for extensions (#20920)
parent
047abc87c2
commit
413be3a6a0
@ -0,0 +1,16 @@ |
||||
cmd/snappytool/snappytool |
||||
testdata/bench |
||||
|
||||
# These explicitly listed benchmark data files are for an obsolete version of |
||||
# snappy_test.go. |
||||
testdata/alice29.txt |
||||
testdata/asyoulik.txt |
||||
testdata/fireworks.jpeg |
||||
testdata/geo.protodata |
||||
testdata/html |
||||
testdata/html_x_4 |
||||
testdata/kppkn.gtb |
||||
testdata/lcet10.txt |
||||
testdata/paper-100k.pdf |
||||
testdata/plrabn12.txt |
||||
testdata/urls.10K |
@ -0,0 +1,15 @@ |
||||
# This is the official list of Snappy-Go authors for copyright purposes. |
||||
# This file is distinct from the CONTRIBUTORS files. |
||||
# See the latter for an explanation. |
||||
|
||||
# Names should be added to this file as |
||||
# Name or Organization <email address> |
||||
# The email address is not required for organizations. |
||||
|
||||
# Please keep the list sorted. |
||||
|
||||
Damian Gryski <dgryski@gmail.com> |
||||
Google Inc. |
||||
Jan Mercl <0xjnml@gmail.com> |
||||
Rodolfo Carvalho <rhcarvalho@gmail.com> |
||||
Sebastien Binet <seb.binet@gmail.com> |
@ -0,0 +1,37 @@ |
||||
# This is the official list of people who can contribute |
||||
# (and typically have contributed) code to the Snappy-Go repository. |
||||
# The AUTHORS file lists the copyright holders; this file |
||||
# lists people. For example, Google employees are listed here |
||||
# but not in AUTHORS, because Google holds the copyright. |
||||
# |
||||
# The submission process automatically checks to make sure |
||||
# that people submitting code are listed in this file (by email address). |
||||
# |
||||
# Names should be added to this file only after verifying that |
||||
# the individual or the individual's organization has agreed to |
||||
# the appropriate Contributor License Agreement, found here: |
||||
# |
||||
# http://code.google.com/legal/individual-cla-v1.0.html |
||||
# http://code.google.com/legal/corporate-cla-v1.0.html |
||||
# |
||||
# The agreement for individuals can be filled out on the web. |
||||
# |
||||
# When adding J Random Contributor's name to this file, |
||||
# either J's name or J's organization's name should be |
||||
# added to the AUTHORS file, depending on whether the |
||||
# individual or corporate CLA was used. |
||||
|
||||
# Names should be added to this file like so: |
||||
# Name <email address> |
||||
|
||||
# Please keep the list sorted. |
||||
|
||||
Damian Gryski <dgryski@gmail.com> |
||||
Jan Mercl <0xjnml@gmail.com> |
||||
Kai Backman <kaib@golang.org> |
||||
Marc-Antoine Ruel <maruel@chromium.org> |
||||
Nigel Tao <nigeltao@golang.org> |
||||
Rob Pike <r@golang.org> |
||||
Rodolfo Carvalho <rhcarvalho@gmail.com> |
||||
Russ Cox <rsc@golang.org> |
||||
Sebastien Binet <seb.binet@gmail.com> |
@ -0,0 +1,27 @@ |
||||
Copyright (c) 2011 The Snappy-Go Authors. All rights reserved. |
||||
|
||||
Redistribution and use in source and binary forms, with or without |
||||
modification, are permitted provided that the following conditions are |
||||
met: |
||||
|
||||
* Redistributions of source code must retain the above copyright |
||||
notice, this list of conditions and the following disclaimer. |
||||
* Redistributions in binary form must reproduce the above |
||||
copyright notice, this list of conditions and the following disclaimer |
||||
in the documentation and/or other materials provided with the |
||||
distribution. |
||||
* Neither the name of Google Inc. nor the names of its |
||||
contributors may be used to endorse or promote products derived from |
||||
this software without specific prior written permission. |
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
@ -0,0 +1,107 @@ |
||||
The Snappy compression format in the Go programming language. |
||||
|
||||
To download and install from source: |
||||
$ go get github.com/golang/snappy |
||||
|
||||
Unless otherwise noted, the Snappy-Go source files are distributed |
||||
under the BSD-style license found in the LICENSE file. |
||||
|
||||
|
||||
|
||||
Benchmarks. |
||||
|
||||
The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten |
||||
or so files, the same set used by the C++ Snappy code (github.com/google/snappy |
||||
and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @ |
||||
3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29: |
||||
|
||||
"go test -test.bench=." |
||||
|
||||
_UFlat0-8 2.19GB/s ± 0% html |
||||
_UFlat1-8 1.41GB/s ± 0% urls |
||||
_UFlat2-8 23.5GB/s ± 2% jpg |
||||
_UFlat3-8 1.91GB/s ± 0% jpg_200 |
||||
_UFlat4-8 14.0GB/s ± 1% pdf |
||||
_UFlat5-8 1.97GB/s ± 0% html4 |
||||
_UFlat6-8 814MB/s ± 0% txt1 |
||||
_UFlat7-8 785MB/s ± 0% txt2 |
||||
_UFlat8-8 857MB/s ± 0% txt3 |
||||
_UFlat9-8 719MB/s ± 1% txt4 |
||||
_UFlat10-8 2.84GB/s ± 0% pb |
||||
_UFlat11-8 1.05GB/s ± 0% gaviota |
||||
|
||||
_ZFlat0-8 1.04GB/s ± 0% html |
||||
_ZFlat1-8 534MB/s ± 0% urls |
||||
_ZFlat2-8 15.7GB/s ± 1% jpg |
||||
_ZFlat3-8 740MB/s ± 3% jpg_200 |
||||
_ZFlat4-8 9.20GB/s ± 1% pdf |
||||
_ZFlat5-8 991MB/s ± 0% html4 |
||||
_ZFlat6-8 379MB/s ± 0% txt1 |
||||
_ZFlat7-8 352MB/s ± 0% txt2 |
||||
_ZFlat8-8 396MB/s ± 1% txt3 |
||||
_ZFlat9-8 327MB/s ± 1% txt4 |
||||
_ZFlat10-8 1.33GB/s ± 1% pb |
||||
_ZFlat11-8 605MB/s ± 1% gaviota |
||||
|
||||
|
||||
|
||||
"go test -test.bench=. -tags=noasm" |
||||
|
||||
_UFlat0-8 621MB/s ± 2% html |
||||
_UFlat1-8 494MB/s ± 1% urls |
||||
_UFlat2-8 23.2GB/s ± 1% jpg |
||||
_UFlat3-8 1.12GB/s ± 1% jpg_200 |
||||
_UFlat4-8 4.35GB/s ± 1% pdf |
||||
_UFlat5-8 609MB/s ± 0% html4 |
||||
_UFlat6-8 296MB/s ± 0% txt1 |
||||
_UFlat7-8 288MB/s ± 0% txt2 |
||||
_UFlat8-8 309MB/s ± 1% txt3 |
||||
_UFlat9-8 280MB/s ± 1% txt4 |
||||
_UFlat10-8 753MB/s ± 0% pb |
||||
_UFlat11-8 400MB/s ± 0% gaviota |
||||
|
||||
_ZFlat0-8 409MB/s ± 1% html |
||||
_ZFlat1-8 250MB/s ± 1% urls |
||||
_ZFlat2-8 12.3GB/s ± 1% jpg |
||||
_ZFlat3-8 132MB/s ± 0% jpg_200 |
||||
_ZFlat4-8 2.92GB/s ± 0% pdf |
||||
_ZFlat5-8 405MB/s ± 1% html4 |
||||
_ZFlat6-8 179MB/s ± 1% txt1 |
||||
_ZFlat7-8 170MB/s ± 1% txt2 |
||||
_ZFlat8-8 189MB/s ± 1% txt3 |
||||
_ZFlat9-8 164MB/s ± 1% txt4 |
||||
_ZFlat10-8 479MB/s ± 1% pb |
||||
_ZFlat11-8 270MB/s ± 1% gaviota |
||||
|
||||
|
||||
|
||||
For comparison (Go's encoded output is byte-for-byte identical to C++'s), here |
||||
are the numbers from C++ Snappy's |
||||
|
||||
make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log |
||||
|
||||
BM_UFlat/0 2.4GB/s html |
||||
BM_UFlat/1 1.4GB/s urls |
||||
BM_UFlat/2 21.8GB/s jpg |
||||
BM_UFlat/3 1.5GB/s jpg_200 |
||||
BM_UFlat/4 13.3GB/s pdf |
||||
BM_UFlat/5 2.1GB/s html4 |
||||
BM_UFlat/6 1.0GB/s txt1 |
||||
BM_UFlat/7 959.4MB/s txt2 |
||||
BM_UFlat/8 1.0GB/s txt3 |
||||
BM_UFlat/9 864.5MB/s txt4 |
||||
BM_UFlat/10 2.9GB/s pb |
||||
BM_UFlat/11 1.2GB/s gaviota |
||||
|
||||
BM_ZFlat/0 944.3MB/s html (22.31 %) |
||||
BM_ZFlat/1 501.6MB/s urls (47.78 %) |
||||
BM_ZFlat/2 14.3GB/s jpg (99.95 %) |
||||
BM_ZFlat/3 538.3MB/s jpg_200 (73.00 %) |
||||
BM_ZFlat/4 8.3GB/s pdf (83.30 %) |
||||
BM_ZFlat/5 903.5MB/s html4 (22.52 %) |
||||
BM_ZFlat/6 336.0MB/s txt1 (57.88 %) |
||||
BM_ZFlat/7 312.3MB/s txt2 (61.91 %) |
||||
BM_ZFlat/8 353.1MB/s txt3 (54.99 %) |
||||
BM_ZFlat/9 289.9MB/s txt4 (66.26 %) |
||||
BM_ZFlat/10 1.2GB/s pb (19.68 %) |
||||
BM_ZFlat/11 527.4MB/s gaviota (37.72 %) |
@ -0,0 +1,237 @@ |
||||
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package snappy |
||||
|
||||
import ( |
||||
"encoding/binary" |
||||
"errors" |
||||
"io" |
||||
) |
||||
|
||||
// Error values returned by this package's decoding entry points.
var ( |
||||
// ErrCorrupt reports that the input is invalid.
// Decode and Reader.Read return it for malformed or truncated input.
|
||||
ErrCorrupt = errors.New("snappy: corrupt input") |
||||
// ErrTooLarge reports that the uncompressed length is too large.
// decodedLen returns it when the declared length exceeds 0x7fffffff on a
// platform whose uint is 32 bits wide.
|
||||
ErrTooLarge = errors.New("snappy: decoded block is too large") |
||||
// ErrUnsupported reports that the input isn't supported.
// Reader.Read returns it for chunks longer than its internal buffer and for
// reserved unskippable chunk types.
|
||||
ErrUnsupported = errors.New("snappy: unsupported input") |
||||
|
||||
// errUnsupportedLiteralLength is returned by Decode when decode reports
// decodeErrCodeUnsupportedLiteralLength.
errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length") |
||||
) |
||||
|
||||
// DecodedLen returns the length of the decoded block.
|
||||
func DecodedLen(src []byte) (int, error) { |
||||
v, _, err := decodedLen(src) |
||||
return v, err |
||||
} |
||||
|
||||
// decodedLen returns the length of the decoded block and the number of bytes
|
||||
// that the length header occupied.
|
||||
func decodedLen(src []byte) (blockLen, headerLen int, err error) { |
||||
v, n := binary.Uvarint(src) |
||||
if n <= 0 || v > 0xffffffff { |
||||
return 0, 0, ErrCorrupt |
||||
} |
||||
|
||||
const wordSize = 32 << (^uint(0) >> 32 & 1) |
||||
if wordSize == 32 && v > 0x7fffffff { |
||||
return 0, 0, ErrTooLarge |
||||
} |
||||
return int(v), n, nil |
||||
} |
||||
|
||||
// Non-zero result codes returned by decode; a result of zero means success.
// Decode translates them into this package's error values.
const ( |
||||
// decodeErrCodeCorrupt: the encoded block is invalid (Decode returns ErrCorrupt).
decodeErrCodeCorrupt = 1 |
||||
// decodeErrCodeUnsupportedLiteralLength: a literal's length could not be
// represented (Decode returns errUnsupportedLiteralLength).
decodeErrCodeUnsupportedLiteralLength = 2 |
||||
) |
||||
|
||||
// Decode returns the decoded form of src. The returned slice may be a sub-
|
||||
// slice of dst if dst was large enough to hold the entire decoded block.
|
||||
// Otherwise, a newly allocated slice will be returned.
|
||||
//
|
||||
// The dst and src must not overlap. It is valid to pass a nil dst.
|
||||
func Decode(dst, src []byte) ([]byte, error) { |
||||
dLen, s, err := decodedLen(src) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
if dLen <= len(dst) { |
||||
dst = dst[:dLen] |
||||
} else { |
||||
dst = make([]byte, dLen) |
||||
} |
||||
switch decode(dst, src[s:]) { |
||||
case 0: |
||||
return dst, nil |
||||
case decodeErrCodeUnsupportedLiteralLength: |
||||
return nil, errUnsupportedLiteralLength |
||||
} |
||||
return nil, ErrCorrupt |
||||
} |
||||
|
||||
// NewReader returns a new Reader that decompresses from r, using the framing
|
||||
// format described at
|
||||
// https://github.com/google/snappy/blob/master/framing_format.txt
|
||||
func NewReader(r io.Reader) *Reader { |
||||
return &Reader{ |
||||
r: r, |
||||
decoded: make([]byte, maxBlockSize), |
||||
buf: make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize), |
||||
} |
||||
} |
||||
|
||||
// Reader is an io.Reader that can read Snappy-compressed bytes.
// Values are created with NewReader, which allocates the two buffers below.
|
||||
type Reader struct { |
||||
// r is the underlying source of framed, compressed data.
r io.Reader |
||||
// err is the sticky error: once it is non-nil, Read keeps returning it.
err error |
||||
// decoded holds the bytes of the most recently decoded chunk.
decoded []byte |
||||
// buf is scratch space for chunk headers and raw chunk bodies.
buf []byte |
||||
// decoded[i:j] contains decoded bytes that have not yet been passed on.
|
||||
i, j int |
||||
// readHeader records whether the first chunk of the stream has been seen;
// Read requires that chunk to be a stream identifier.
readHeader bool |
||||
} |
||||
|
||||
// Reset discards any buffered data, resets all state, and switches the Snappy
|
||||
// reader to read from r. This permits reusing a Reader rather than allocating
|
||||
// a new one.
|
||||
func (r *Reader) Reset(reader io.Reader) { |
||||
r.r = reader |
||||
r.err = nil |
||||
r.i = 0 |
||||
r.j = 0 |
||||
r.readHeader = false |
||||
} |
||||
|
||||
func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) { |
||||
if _, r.err = io.ReadFull(r.r, p); r.err != nil { |
||||
if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) { |
||||
r.err = ErrCorrupt |
||||
} |
||||
return false |
||||
} |
||||
return true |
||||
} |
||||
|
||||
// Read satisfies the io.Reader interface.
|
||||
func (r *Reader) Read(p []byte) (int, error) { |
||||
if r.err != nil { |
||||
return 0, r.err |
||||
} |
||||
for { |
||||
if r.i < r.j { |
||||
n := copy(p, r.decoded[r.i:r.j]) |
||||
r.i += n |
||||
return n, nil |
||||
} |
||||
if !r.readFull(r.buf[:4], true) { |
||||
return 0, r.err |
||||
} |
||||
chunkType := r.buf[0] |
||||
if !r.readHeader { |
||||
if chunkType != chunkTypeStreamIdentifier { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
r.readHeader = true |
||||
} |
||||
chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 |
||||
if chunkLen > len(r.buf) { |
||||
r.err = ErrUnsupported |
||||
return 0, r.err |
||||
} |
||||
|
||||
// The chunk types are specified at
|
||||
// https://github.com/google/snappy/blob/master/framing_format.txt
|
||||
switch chunkType { |
||||
case chunkTypeCompressedData: |
||||
// Section 4.2. Compressed data (chunk type 0x00).
|
||||
if chunkLen < checksumSize { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
buf := r.buf[:chunkLen] |
||||
if !r.readFull(buf, false) { |
||||
return 0, r.err |
||||
} |
||||
checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 |
||||
buf = buf[checksumSize:] |
||||
|
||||
n, err := DecodedLen(buf) |
||||
if err != nil { |
||||
r.err = err |
||||
return 0, r.err |
||||
} |
||||
if n > len(r.decoded) { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
if _, err := Decode(r.decoded, buf); err != nil { |
||||
r.err = err |
||||
return 0, r.err |
||||
} |
||||
if crc(r.decoded[:n]) != checksum { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
r.i, r.j = 0, n |
||||
continue |
||||
|
||||
case chunkTypeUncompressedData: |
||||
// Section 4.3. Uncompressed data (chunk type 0x01).
|
||||
if chunkLen < checksumSize { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
buf := r.buf[:checksumSize] |
||||
if !r.readFull(buf, false) { |
||||
return 0, r.err |
||||
} |
||||
checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24 |
||||
// Read directly into r.decoded instead of via r.buf.
|
||||
n := chunkLen - checksumSize |
||||
if n > len(r.decoded) { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
if !r.readFull(r.decoded[:n], false) { |
||||
return 0, r.err |
||||
} |
||||
if crc(r.decoded[:n]) != checksum { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
r.i, r.j = 0, n |
||||
continue |
||||
|
||||
case chunkTypeStreamIdentifier: |
||||
// Section 4.1. Stream identifier (chunk type 0xff).
|
||||
if chunkLen != len(magicBody) { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
if !r.readFull(r.buf[:len(magicBody)], false) { |
||||
return 0, r.err |
||||
} |
||||
for i := 0; i < len(magicBody); i++ { |
||||
if r.buf[i] != magicBody[i] { |
||||
r.err = ErrCorrupt |
||||
return 0, r.err |
||||
} |
||||
} |
||||
continue |
||||
} |
||||
|
||||
if chunkType <= 0x7f { |
||||
// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
|
||||
r.err = ErrUnsupported |
||||
return 0, r.err |
||||
} |
||||
// Section 4.4 Padding (chunk type 0xfe).
|
||||
// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
|
||||
if !r.readFull(r.buf[:chunkLen], false) { |
||||
return 0, r.err |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,14 @@ |
||||
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !appengine
|
||||
// +build gc
|
||||
// +build !noasm
|
||||
|
||||
package snappy |
||||
|
||||
// decode has the same semantics as in decode_other.go.
// It writes the decoding of src into dst and returns 0 on success or one of
// the decodeErrCodeXxx values on failure. This is a declaration only: there
// is no Go body here (presumably supplied by the ·decode assembly routine
// built under the same constraints).
|
||||
//
|
||||
//go:noescape
|
||||
func decode(dst, src []byte) int |
@ -0,0 +1,490 @@ |
||||
// Copyright 2016 The Go Authors. All rights reserved. |
||||
// Use of this source code is governed by a BSD-style |
||||
// license that can be found in the LICENSE file. |
||||
|
||||
// +build !appengine |
||||
// +build gc |
||||
// +build !noasm |
||||
|
||||
#include "textflag.h" |
||||
|
||||
// The asm code generally follows the pure Go code in decode_other.go, except |
||||
// where marked with a "!!!". |
||||
|
||||
// func decode(dst, src []byte) int |
||||
// |
||||
// All local variables fit into registers. The non-zero stack size is only to |
||||
// spill registers and push args when issuing a CALL. The register allocation: |
||||
// - AX scratch |
||||
// - BX scratch |
||||
// - CX length or x |
||||
// - DX offset |
||||
// - SI &src[s] |
||||
// - DI &dst[d] |
||||
// + R8 dst_base |
||||
// + R9 dst_len |
||||
// + R10 dst_base + dst_len |
||||
// + R11 src_base |
||||
// + R12 src_len |
||||
// + R13 src_base + src_len |
||||
// - R14 used by doCopy |
||||
// - R15 used by doCopy |
||||
// |
||||
// The registers R8-R13 (marked with a "+") are set at the start of the |
||||
// function, and after a CALL returns, and are not otherwise modified. |
||||
// |
||||
// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI. |
||||
// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI. |
||||
// Frame layout ($48-56): the 56 argument bytes hold dst at 0(FP), src at
// 24(FP) and the int result at 48(FP). The 48 local bytes are used only
// around the CALL to runtime·memmove: 0(SP)-16(SP) carry its three arguments
// and 24(SP)-40(SP) hold the spilled DI, SI and CX.
TEXT ·decode(SB), NOSPLIT, $48-56 |
||||
// Initialize SI, DI and R8-R13. |
||||
MOVQ dst_base+0(FP), R8 |
||||
MOVQ dst_len+8(FP), R9 |
||||
MOVQ R8, DI |
||||
MOVQ R8, R10 |
||||
ADDQ R9, R10 |
||||
MOVQ src_base+24(FP), R11 |
||||
MOVQ src_len+32(FP), R12 |
||||
MOVQ R11, SI |
||||
MOVQ R11, R13 |
||||
ADDQ R12, R13 |
||||
|
||||
loop: |
||||
// for s < len(src) |
||||
CMPQ SI, R13 |
||||
JEQ end |
||||
|
||||
// CX = uint32(src[s]) |
||||
// |
||||
// switch src[s] & 0x03 |
||||
// |
// Only a tag value of 0 (a literal) falls through; 1, 2 and 3 are the copy
// tags, which are told apart at tagCopy.
MOVBLZX (SI), CX |
||||
MOVL CX, BX |
||||
ANDL $3, BX |
||||
CMPL BX, $1 |
||||
JAE tagCopy |
||||
|
||||
// ---------------------------------------- |
||||
// The code below handles literal tags. |
||||
|
||||
// case tagLiteral: |
||||
// x := uint32(src[s] >> 2) |
||||
// switch |
||||
SHRL $2, CX |
||||
CMPL CX, $60 |
||||
JAE tagLit60Plus |
||||
|
||||
// case x < 60: |
||||
// s++ |
||||
INCQ SI |
||||
|
||||
doLit: |
||||
// This is the end of the inner "switch", when we have a literal tag. |
||||
// |
||||
// We assume that CX == x and x fits in a uint32, where x is the variable |
||||
// used in the pure Go decode_other.go code. |
||||
|
||||
// length = int(x) + 1 |
||||
// |
||||
// Unlike the pure Go code, we don't need to check if length <= 0 because |
||||
// CX can hold 64 bits, so the increment cannot overflow. |
||||
INCQ CX |
||||
|
||||
// Prepare to check if copying length bytes will run past the end of dst or |
||||
// src. |
||||
// |
||||
// AX = len(dst) - d |
||||
// BX = len(src) - s |
||||
MOVQ R10, AX |
||||
SUBQ DI, AX |
||||
MOVQ R13, BX |
||||
SUBQ SI, BX |
||||
|
||||
// !!! Try a faster technique for short (16 or fewer bytes) copies. |
||||
// |
||||
// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 { |
||||
// goto callMemmove // Fall back on calling runtime·memmove. |
||||
// } |
||||
// |
||||
// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s |
||||
// against 21 instead of 16, because it cannot assume that all of its input |
||||
// is contiguous in memory and so it needs to leave enough source bytes to |
||||
// read the next tag without refilling buffers, but Go's Decode assumes |
||||
// contiguousness (the src argument is a []byte). |
||||
CMPQ CX, $16 |
||||
JGT callMemmove |
||||
CMPQ AX, $16 |
||||
JLT callMemmove |
||||
CMPQ BX, $16 |
||||
JLT callMemmove |
||||
|
||||
// !!! Implement the copy from src to dst as a 16-byte load and store. |
||||
// (Decode's documentation says that dst and src must not overlap.) |
||||
// |
||||
// This always copies 16 bytes, instead of only length bytes, but that's |
||||
// OK. If the input is a valid Snappy encoding then subsequent iterations |
||||
// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a |
||||
// non-nil error), so the overrun will be ignored. |
||||
// |
||||
// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or |
||||
// 16-byte loads and stores. This technique probably wouldn't be as |
||||
// effective on architectures that are fussier about alignment. |
||||
MOVOU 0(SI), X0 |
||||
MOVOU X0, 0(DI) |
||||
|
||||
// d += length |
||||
// s += length |
||||
ADDQ CX, DI |
||||
ADDQ CX, SI |
||||
JMP loop |
||||
|
||||
callMemmove: |
||||
// if length > len(dst)-d || length > len(src)-s { etc } |
||||
CMPQ CX, AX |
||||
JGT errCorrupt |
||||
CMPQ CX, BX |
||||
JGT errCorrupt |
||||
|
||||
// copy(dst[d:], src[s:s+length]) |
||||
// |
||||
// This means calling runtime·memmove(&dst[d], &src[s], length), so we push |
||||
// DI, SI and CX as arguments. Coincidentally, we also need to spill those |
||||
// three registers to the stack, to save local variables across the CALL. |
||||
// |
// The first three stores are the arguments; the next three are the spills.
MOVQ DI, 0(SP) |
||||
MOVQ SI, 8(SP) |
||||
MOVQ CX, 16(SP) |
||||
MOVQ DI, 24(SP) |
||||
MOVQ SI, 32(SP) |
||||
MOVQ CX, 40(SP) |
||||
CALL runtime·memmove(SB) |
||||
|
||||
// Restore local variables: unspill registers from the stack and |
||||
// re-calculate R8-R13. |
||||
MOVQ 24(SP), DI |
||||
MOVQ 32(SP), SI |
||||
MOVQ 40(SP), CX |
||||
MOVQ dst_base+0(FP), R8 |
||||
MOVQ dst_len+8(FP), R9 |
||||
MOVQ R8, R10 |
||||
ADDQ R9, R10 |
||||
MOVQ src_base+24(FP), R11 |
||||
MOVQ src_len+32(FP), R12 |
||||
MOVQ R11, R13 |
||||
ADDQ R12, R13 |
||||
|
||||
// d += length |
||||
// s += length |
||||
ADDQ CX, DI |
||||
ADDQ CX, SI |
||||
JMP loop |
||||
|
||||
tagLit60Plus: |
||||
// !!! This fragment does the |
||||
// |
||||
// s += x - 58; if uint(s) > uint(len(src)) { etc }
|
||||
// |
||||
// checks. In the asm version, we code it once instead of once per switch case. |
||||
// |
// CX is in the range 60-63 here, so x - 58 is 2-5, matching the s += 2,
// s += 3, s += 4 and s += 5 of the four pure Go cases.
ADDQ CX, SI |
||||
SUBQ $58, SI |
||||
MOVQ SI, BX |
||||
SUBQ R11, BX |
||||
CMPQ BX, R12 |
||||
JA errCorrupt |
||||
|
||||
// case x == 60: |
||||
// (x == 61 and x >= 62 branch away; x == 60 falls through.)
CMPL CX, $61 |
||||
JEQ tagLit61 |
||||
JA tagLit62Plus |
||||
|
||||
// x = uint32(src[s-1]) |
||||
MOVBLZX -1(SI), CX |
||||
JMP doLit |
||||
|
||||
tagLit61: |
||||
// case x == 61: |
||||
// x = uint32(src[s-2]) | uint32(src[s-1])<<8 |
||||
MOVWLZX -2(SI), CX |
||||
JMP doLit |
||||
|
||||
tagLit62Plus: |
||||
CMPL CX, $62 |
||||
JA tagLit63 |
||||
|
||||
// case x == 62: |
||||
// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 |
||||
MOVWLZX -3(SI), CX |
||||
MOVBLZX -1(SI), BX |
||||
SHLL $16, BX |
||||
ORL BX, CX |
||||
JMP doLit |
||||
|
||||
tagLit63: |
||||
// case x == 63: |
||||
// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 |
||||
MOVL -4(SI), CX |
||||
JMP doLit |
||||
|
||||
// The code above handles literal tags. |
||||
// ---------------------------------------- |
||||
// The code below handles copy tags. |
||||
|
||||
tagCopy4: |
||||
// case tagCopy4: |
||||
// s += 5 |
||||
ADDQ $5, SI |
||||
|
||||
// if uint(s) > uint(len(src)) { etc } |
||||
MOVQ SI, BX |
||||
SUBQ R11, BX |
||||
CMPQ BX, R12 |
||||
JA errCorrupt |
||||
|
||||
// length = 1 + int(src[s-5])>>2 |
||||
SHRQ $2, CX |
||||
INCQ CX |
||||
|
||||
// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) |
||||
MOVLQZX -4(SI), DX |
||||
JMP doCopy |
||||
|
||||
tagCopy2: |
||||
// case tagCopy2: |
||||
// s += 3 |
||||
ADDQ $3, SI |
||||
|
||||
// if uint(s) > uint(len(src)) { etc } |
||||
MOVQ SI, BX |
||||
SUBQ R11, BX |
||||
CMPQ BX, R12 |
||||
JA errCorrupt |
||||
|
||||
// length = 1 + int(src[s-3])>>2 |
||||
SHRQ $2, CX |
||||
INCQ CX |
||||
|
||||
// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) |
||||
MOVWQZX -2(SI), DX |
||||
JMP doCopy |
||||
|
||||
tagCopy: |
||||
// We have a copy tag. We assume that: |
||||
// - BX == src[s] & 0x03 |
||||
// - CX == src[s] |
||||
// |
// BX == 2 selects tagCopy2, BX == 3 selects tagCopy4, and BX == 1 falls
// through to the tagCopy1 code below.
CMPQ BX, $2 |
||||
JEQ tagCopy2 |
||||
JA tagCopy4 |
||||
|
||||
// case tagCopy1: |
||||
// s += 2 |
||||
ADDQ $2, SI |
||||
|
||||
// if uint(s) > uint(len(src)) { etc } |
||||
MOVQ SI, BX |
||||
SUBQ R11, BX |
||||
CMPQ BX, R12 |
||||
JA errCorrupt |
||||
|
||||
// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) |
||||
MOVQ CX, DX |
||||
ANDQ $0xe0, DX |
||||
SHLQ $3, DX |
||||
MOVBQZX -1(SI), BX |
||||
ORQ BX, DX |
||||
|
||||
// length = 4 + int(src[s-2])>>2&0x7 |
||||
SHRQ $2, CX |
||||
ANDQ $7, CX |
||||
ADDQ $4, CX |
||||
|
||||
doCopy: |
||||
// This is the end of the outer "switch", when we have a copy tag. |
||||
// |
||||
// We assume that: |
||||
// - CX == length && CX > 0 |
||||
// - DX == offset |
||||
|
||||
// if offset <= 0 { etc } |
||||
CMPQ DX, $0 |
||||
JLE errCorrupt |
||||
|
||||
// if d < offset { etc } |
||||
MOVQ DI, BX |
||||
SUBQ R8, BX |
||||
CMPQ BX, DX |
||||
JLT errCorrupt |
||||
|
||||
// if length > len(dst)-d { etc } |
||||
MOVQ R10, BX |
||||
SUBQ DI, BX |
||||
CMPQ CX, BX |
||||
JGT errCorrupt |
||||
|
||||
// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
|
||||
// |
||||
// Set: |
||||
// - R14 = len(dst)-d |
||||
// - R15 = &dst[d-offset] |
||||
MOVQ R10, R14 |
||||
SUBQ DI, R14 |
||||
MOVQ DI, R15 |
||||
SUBQ DX, R15 |
||||
|
||||
// !!! Try a faster technique for short (16 or fewer bytes) forward copies. |
||||
// |
||||
// First, try using two 8-byte load/stores, similar to the doLit technique |
||||
// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is |
||||
// still OK if offset >= 8. Note that this has to be two 8-byte load/stores |
||||
// and not one 16-byte load/store, and the first store has to be before the |
||||
// second load, due to the overlap if offset is in the range [8, 16). |
||||
// |
||||
// if length > 16 || offset < 8 || len(dst)-d < 16 { |
||||
// goto slowForwardCopy |
||||
// } |
||||
// copy 16 bytes |
||||
// d += length |
||||
CMPQ CX, $16 |
||||
JGT slowForwardCopy |
||||
CMPQ DX, $8 |
||||
JLT slowForwardCopy |
||||
CMPQ R14, $16 |
||||
JLT slowForwardCopy |
||||
MOVQ 0(R15), AX |
||||
MOVQ AX, 0(DI) |
||||
MOVQ 8(R15), BX |
||||
MOVQ BX, 8(DI) |
||||
ADDQ CX, DI |
||||
JMP loop |
||||
|
||||
slowForwardCopy: |
||||
// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we |
||||
// can still try 8-byte load stores, provided we can overrun up to 10 extra |
||||
// bytes. As above, the overrun will be fixed up by subsequent iterations |
||||
// of the outermost loop. |
||||
// |
||||
// The C++ snappy code calls this technique IncrementalCopyFastPath. Its |
||||
// commentary says: |
||||
// |
||||
// ---- |
||||
// |
||||
// The main part of this loop is a simple copy of eight bytes at a time |
||||
// until we've copied (at least) the requested amount of bytes. However, |
||||
// if d and d-offset are less than eight bytes apart (indicating a |
||||
// repeating pattern of length < 8), we first need to expand the pattern in |
||||
// order to get the correct results. For instance, if the buffer looks like |
||||
// this, with the eight-byte <d-offset> and <d> patterns marked as |
||||
// intervals: |
||||
// |
||||
// abxxxxxxxxxxxx |
||||
// [------] d-offset |
||||
// [------] d |
||||
// |
||||
// a single eight-byte copy from <d-offset> to <d> will repeat the pattern |
||||
// once, after which we can move <d> two bytes without moving <d-offset>: |
||||
// |
||||
// ababxxxxxxxxxx |
||||
// [------] d-offset |
||||
// [------] d |
||||
// |
||||
// and repeat the exercise until the two no longer overlap. |
||||
// |
||||
// This allows us to do very well in the special case of one single byte |
||||
// repeated many times, without taking a big hit for more general cases. |
||||
// |
||||
// The worst case of extra writing past the end of the match occurs when |
||||
// offset == 1 and length == 1; the last copy will read from byte positions
|
||||
// [0..7] and write to [4..11], whereas it was only supposed to write to |
||||
// position 1. Thus, ten excess bytes. |
||||
// |
||||
// ---- |
||||
// |
||||
// That "10 byte overrun" worst case is confirmed by Go's |
||||
// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy |
||||
// and finishSlowForwardCopy algorithm. |
||||
// |
||||
// if length > len(dst)-d-10 { |
||||
// goto verySlowForwardCopy |
||||
// } |
||||
// |
// R14 is len(dst)-d on entry (set in doCopy) and len(dst)-d-10 afterwards.
SUBQ $10, R14 |
||||
CMPQ CX, R14 |
||||
JGT verySlowForwardCopy |
||||
|
||||
makeOffsetAtLeast8: |
||||
// !!! As above, expand the pattern so that offset >= 8 and we can use |
||||
// 8-byte load/stores. |
||||
// |
||||
// for offset < 8 { |
||||
// copy 8 bytes from dst[d-offset:] to dst[d:] |
||||
// length -= offset |
||||
// d += offset |
||||
// offset += offset |
||||
// // The two previous lines together mean that d-offset, and therefore |
||||
// // R15, is unchanged. |
||||
// } |
||||
CMPQ DX, $8 |
||||
JGE fixUpSlowForwardCopy |
||||
MOVQ (R15), BX |
||||
MOVQ BX, (DI) |
||||
SUBQ DX, CX |
||||
ADDQ DX, DI |
||||
ADDQ DX, DX |
||||
JMP makeOffsetAtLeast8 |
||||
|
||||
fixUpSlowForwardCopy: |
||||
// !!! Add length (which might be negative now) to d (implied by DI being |
||||
// &dst[d]) so that d ends up at the right place when we jump back to the |
||||
// top of the loop. Before we do that, though, we save DI to AX so that, if |
||||
// length is positive, copying the remaining length bytes will write to the |
||||
// right place. |
||||
MOVQ DI, AX |
||||
ADDQ CX, DI |
||||
|
||||
finishSlowForwardCopy: |
||||
// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative |
||||
// length means that we overrun, but as above, that will be fixed up by |
||||
// subsequent iterations of the outermost loop. |
||||
CMPQ CX, $0 |
||||
JLE loop |
||||
MOVQ (R15), BX |
||||
MOVQ BX, (AX) |
||||
ADDQ $8, R15 |
||||
ADDQ $8, AX |
||||
SUBQ $8, CX |
||||
JMP finishSlowForwardCopy |
||||
|
||||
verySlowForwardCopy: |
||||
// verySlowForwardCopy is a simple implementation of forward copy. In C |
||||
// parlance, this is a do/while loop instead of a while loop, since we know |
||||
// that length > 0. In Go syntax: |
||||
// |
||||
// for { |
||||
// dst[d] = dst[d - offset] |
||||
// d++ |
||||
// length-- |
||||
// if length == 0 { |
||||
// break |
||||
// } |
||||
// } |
||||
MOVB (R15), BX |
||||
MOVB BX, (DI) |
||||
INCQ R15 |
||||
INCQ DI |
||||
DECQ CX |
||||
JNZ verySlowForwardCopy |
||||
JMP loop |
||||
|
||||
// The code above handles copy tags. |
||||
// ---------------------------------------- |
||||
|
||||
end: |
||||
// This is the end of the "for s < len(src)". |
||||
// |
||||
// if d != len(dst) { etc } |
||||
CMPQ DI, R10 |
||||
JNE errCorrupt |
||||
|
||||
// return 0 |
||||
MOVQ $0, ret+48(FP) |
||||
RET |
||||
|
||||
errCorrupt: |
||||
// return decodeErrCodeCorrupt |
||||
MOVQ $1, ret+48(FP) |
||||
RET |
@ -0,0 +1,101 @@ |
||||
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64 appengine !gc noasm
|
||||
|
||||
package snappy |
||||
|
||||
// decode writes the decoding of src to dst. It assumes that the varint-encoded
|
||||
// length of the decompressed bytes has already been read, and that len(dst)
|
||||
// equals that length.
|
||||
//
|
||||
// It returns 0 on success or a decodeErrCodeXxx error code on failure.
|
||||
func decode(dst, src []byte) int { |
||||
var d, s, offset, length int |
||||
for s < len(src) { |
||||
switch src[s] & 0x03 { |
||||
case tagLiteral: |
||||
x := uint32(src[s] >> 2) |
||||
switch { |
||||
case x < 60: |
||||
s++ |
||||
case x == 60: |
||||
s += 2 |
||||
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||
return decodeErrCodeCorrupt |
||||
} |
||||
x = uint32(src[s-1]) |
||||
case x == 61: |
||||
s += 3 |
||||
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||
return decodeErrCodeCorrupt |
||||
} |
||||
x = uint32(src[s-2]) | uint32(src[s-1])<<8 |
||||
case x == 62: |
||||
s += 4 |
||||
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||
return decodeErrCodeCorrupt |
||||
} |
||||
x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 |
||||
case x == 63: |
||||
s += 5 |
||||
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||
return decodeErrCodeCorrupt |
||||
} |
||||
x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 |
||||
} |
||||
length = int(x) + 1 |
||||
if length <= 0 { |
||||
return decodeErrCodeUnsupportedLiteralLength |
||||
} |
||||
if length > len(dst)-d || length > len(src)-s { |
||||
return decodeErrCodeCorrupt |
||||
} |
||||
copy(dst[d:], src[s:s+length]) |
||||
d += length |
||||
s += length |
||||
continue |
||||
|
||||
case tagCopy1: |
||||
s += 2 |
||||
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||
return decodeErrCodeCorrupt |
||||
} |
||||
length = 4 + int(src[s-2])>>2&0x7 |
||||
offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])) |
||||
|
||||
case tagCopy2: |
||||
s += 3 |
||||
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||
return decodeErrCodeCorrupt |
||||
} |
||||
length = 1 + int(src[s-3])>>2 |
||||
offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8) |
||||
|
||||
case tagCopy4: |
||||
s += 5 |
||||
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
|
||||
return decodeErrCodeCorrupt |
||||
} |
||||
length = 1 + int(src[s-5])>>2 |
||||
offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24) |
||||
} |
||||
|
||||
if offset <= 0 || d < offset || length > len(dst)-d { |
||||
return decodeErrCodeCorrupt |
||||
} |
||||
// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
|
||||
// the built-in copy function, this byte-by-byte copy always runs
|
||||
// forwards, even if the slices overlap. Conceptually, this is:
|
||||
//
|
||||
// d += forwardCopy(dst[d:d+length], dst[d-offset:])
|
||||
for end := d + length; d != end; d++ { |
||||
dst[d] = dst[d-offset] |
||||
} |
||||
} |
||||
if d != len(dst) { |
||||
return decodeErrCodeCorrupt |
||||
} |
||||
return 0 |
||||
} |
@ -0,0 +1,285 @@ |
||||
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package snappy |
||||
|
||||
import ( |
||||
"encoding/binary" |
||||
"errors" |
||||
"io" |
||||
) |
||||
|
||||
// Encode returns the encoded form of src. The returned slice may be a sub-
|
||||
// slice of dst if dst was large enough to hold the entire encoded block.
|
||||
// Otherwise, a newly allocated slice will be returned.
|
||||
//
|
||||
// The dst and src must not overlap. It is valid to pass a nil dst.
|
||||
func Encode(dst, src []byte) []byte { |
||||
if n := MaxEncodedLen(len(src)); n < 0 { |
||||
panic(ErrTooLarge) |
||||
} else if len(dst) < n { |
||||
dst = make([]byte, n) |
||||
} |
||||
|
||||
// The block starts with the varint-encoded length of the decompressed bytes.
|
||||
d := binary.PutUvarint(dst, uint64(len(src))) |
||||
|
||||
for len(src) > 0 { |
||||
p := src |
||||
src = nil |
||||
if len(p) > maxBlockSize { |
||||
p, src = p[:maxBlockSize], p[maxBlockSize:] |
||||
} |
||||
if len(p) < minNonLiteralBlockSize { |
||||
d += emitLiteral(dst[d:], p) |
||||
} else { |
||||
d += encodeBlock(dst[d:], p) |
||||
} |
||||
} |
||||
return dst[:d] |
||||
} |
||||
|
||||
// inputMargin is the minimum number of extra input bytes that encodeBlock's
// inner loop keeps in reserve. On some architectures this lets emitLiteral
// take a fast path in which a short (<= 16 byte) literal is copied with a
// single 16-byte load and store. The literal may really be as short as 1
// byte, so that copy can write up to 15 bytes too many. That is fine: later
// iterations of the encoding loop overwrite the excess, and the margin
// guarantees the over-copy stays inside the dst and src buffers.
const inputMargin = 16 - 1
||||
|
||||
// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
|
||||
// could be encoded with a copy tag. This is the minimum with respect to the
|
||||
// algorithm used by encodeBlock, not a minimum enforced by the file format.
|
||||
//
|
||||
// The encoded output must start with at least a 1 byte literal, as there are
|
||||
// no previous bytes to copy. A minimal (1 byte) copy after that, generated
|
||||
// from an emitCopy call in encodeBlock's main loop, would require at least
|
||||
// another inputMargin bytes, for the reason above: we want any emitLiteral
|
||||
// calls inside encodeBlock's main loop to use the fast path if possible, which
|
||||
// requires being able to overrun by inputMargin bytes. Thus,
|
||||
// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
|
||||
//
|
||||
// The C++ code doesn't use this exact threshold, but it could, as discussed at
|
||||
// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
|
||||
// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
|
||||
// optimization. It should not affect the encoded form. This is tested by
|
||||
// TestSameEncodingAsCppShortCopies.
|
||||
const minNonLiteralBlockSize = 1 + 1 + inputMargin |
||||
|
||||
// MaxEncodedLen returns the maximum length of a snappy block, given its
// uncompressed length.
//
// It will return a negative value if srcLen is too large to encode. A negative
// srcLen is also reported as too large, because it is converted to an unsigned
// value before being range-checked.
func MaxEncodedLen(srcLen int) int {
	// Both the input length and the resulting bound must fit in 32 bits.
	const limit = 0xffffffff

	n := uint64(srcLen)
	if n > limit {
		return -1
	}

	// Compressed data can be defined as:
	//    compressed := item* literal*
	//    item       := literal* copy
	//
	// The trailing literal sequence has a space blowup of at most 62/60
	// since a literal of length 60 needs one tag byte + one extra byte
	// for length information.
	//
	// Item blowup is trickier to measure. Suppose the "copy" op copies
	// 4 bytes of data. Because of a special check in the encoding code,
	// we produce a 4-byte copy only if the offset is < 65536. Therefore
	// the copy op takes 3 bytes to encode, and this type of item leads
	// to at most the 62/60 blowup for representing literals.
	//
	// Suppose the "copy" op copies 5 bytes of data. If the offset is big
	// enough, it will take 5 bytes to encode the copy op. Therefore the
	// worst case here is a one-byte literal followed by a five-byte copy.
	// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
	//
	// This last factor dominates the blowup, so the final estimate is:
	n = 32 + n + n/6
	if n > limit {
		return -1
	}
	return int(n)
}
||||
|
||||
// errClosed is the error a Writer records once Close has completed without
// any earlier error; later operations on that Writer then return it.
var errClosed = errors.New("snappy: Writer is closed")
||||
|
||||
// NewWriter returns a new Writer that compresses to w.
|
||||
//
|
||||
// The Writer returned does not buffer writes. There is no need to Flush or
|
||||
// Close such a Writer.
|
||||
//
|
||||
// Deprecated: the Writer returned is not suitable for many small writes, only
|
||||
// for few large writes. Use NewBufferedWriter instead, which is efficient
|
||||
// regardless of the frequency and shape of the writes, and remember to Close
|
||||
// that Writer when done.
|
||||
func NewWriter(w io.Writer) *Writer { |
||||
return &Writer{ |
||||
w: w, |
||||
obuf: make([]byte, obufLen), |
||||
} |
||||
} |
||||
|
||||
// NewBufferedWriter returns a new Writer that compresses to w, using the
|
||||
// framing format described at
|
||||
// https://github.com/google/snappy/blob/master/framing_format.txt
|
||||
//
|
||||
// The Writer returned buffers writes. Users must call Close to guarantee all
|
||||
// data has been forwarded to the underlying io.Writer. They may also call
|
||||
// Flush zero or more times before calling Close.
|
||||
func NewBufferedWriter(w io.Writer) *Writer { |
||||
return &Writer{ |
||||
w: w, |
||||
ibuf: make([]byte, 0, maxBlockSize), |
||||
obuf: make([]byte, obufLen), |
||||
} |
||||
} |
||||
|
||||
// Writer is an io.Writer that can write Snappy-compressed bytes.
type Writer struct {
	// w is the destination for the compressed output.
	w io.Writer

	// err is the first error encountered, or errClosed after a clean Close.
	// Once set, it is returned by subsequent operations.
	err error

	// ibuf is a buffer for the incoming (uncompressed) bytes.
	//
	// Its use is optional. For backwards compatibility, Writers created by the
	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
	// therefore do not need to be Flush'ed or Close'd.
	ibuf []byte

	// obuf is a buffer for the outgoing (compressed) bytes.
	obuf []byte

	// wroteStreamHeader is whether we have written the stream header.
	wroteStreamHeader bool
}
||||
|
||||
// Reset discards the writer's state and switches the Snappy writer to write to
|
||||
// w. This permits reusing a Writer rather than allocating a new one.
|
||||
func (w *Writer) Reset(writer io.Writer) { |
||||
w.w = writer |
||||
w.err = nil |
||||
if w.ibuf != nil { |
||||
w.ibuf = w.ibuf[:0] |
||||
} |
||||
w.wroteStreamHeader = false |
||||
} |
||||
|
||||
// Write satisfies the io.Writer interface.
|
||||
func (w *Writer) Write(p []byte) (nRet int, errRet error) { |
||||
if w.ibuf == nil { |
||||
// Do not buffer incoming bytes. This does not perform or compress well
|
||||
// if the caller of Writer.Write writes many small slices. This
|
||||
// behavior is therefore deprecated, but still supported for backwards
|
||||
// compatibility with code that doesn't explicitly Flush or Close.
|
||||
return w.write(p) |
||||
} |
||||
|
||||
// The remainder of this method is based on bufio.Writer.Write from the
|
||||
// standard library.
|
||||
|
||||
for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil { |
||||
var n int |
||||
if len(w.ibuf) == 0 { |
||||
// Large write, empty buffer.
|
||||
// Write directly from p to avoid copy.
|
||||
n, _ = w.write(p) |
||||
} else { |
||||
n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) |
||||
w.ibuf = w.ibuf[:len(w.ibuf)+n] |
||||
w.Flush() |
||||
} |
||||
nRet += n |
||||
p = p[n:] |
||||
} |
||||
if w.err != nil { |
||||
return nRet, w.err |
||||
} |
||||
n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) |
||||
w.ibuf = w.ibuf[:len(w.ibuf)+n] |
||||
nRet += n |
||||
return nRet, nil |
||||
} |
||||
|
||||
func (w *Writer) write(p []byte) (nRet int, errRet error) { |
||||
if w.err != nil { |
||||
return 0, w.err |
||||
} |
||||
for len(p) > 0 { |
||||
obufStart := len(magicChunk) |
||||
if !w.wroteStreamHeader { |
||||
w.wroteStreamHeader = true |
||||
copy(w.obuf, magicChunk) |
||||
obufStart = 0 |
||||
} |
||||
|
||||
var uncompressed []byte |
||||
if len(p) > maxBlockSize { |
||||
uncompressed, p = p[:maxBlockSize], p[maxBlockSize:] |
||||
} else { |
||||
uncompressed, p = p, nil |
||||
} |
||||
checksum := crc(uncompressed) |
||||
|
||||
// Compress the buffer, discarding the result if the improvement
|
||||
// isn't at least 12.5%.
|
||||
compressed := Encode(w.obuf[obufHeaderLen:], uncompressed) |
||||
chunkType := uint8(chunkTypeCompressedData) |
||||
chunkLen := 4 + len(compressed) |
||||
obufEnd := obufHeaderLen + len(compressed) |
||||
if len(compressed) >= len(uncompressed)-len(uncompressed)/8 { |
||||
chunkType = chunkTypeUncompressedData |
||||
chunkLen = 4 + len(uncompressed) |
||||
obufEnd = obufHeaderLen |
||||
} |
||||
|
||||
// Fill in the per-chunk header that comes before the body.
|
||||
w.obuf[len(magicChunk)+0] = chunkType |
||||
w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0) |
||||
w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8) |
||||
w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16) |
||||
w.obuf[len(magicChunk)+4] = uint8(checksum >> 0) |
||||
w.obuf[len(magicChunk)+5] = uint8(checksum >> 8) |
||||
w.obuf[len(magicChunk)+6] = uint8(checksum >> 16) |
||||
w.obuf[len(magicChunk)+7] = uint8(checksum >> 24) |
||||
|
||||
if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil { |
||||
w.err = err |
||||
return nRet, err |
||||
} |
||||
if chunkType == chunkTypeUncompressedData { |
||||
if _, err := w.w.Write(uncompressed); err != nil { |
||||
w.err = err |
||||
return nRet, err |
||||
} |
||||
} |
||||
nRet += len(uncompressed) |
||||
} |
||||
return nRet, nil |
||||
} |
||||
|
||||
// Flush flushes the Writer to its underlying io.Writer.
|
||||
func (w *Writer) Flush() error { |
||||
if w.err != nil { |
||||
return w.err |
||||
} |
||||
if len(w.ibuf) == 0 { |
||||
return nil |
||||
} |
||||
w.write(w.ibuf) |
||||
w.ibuf = w.ibuf[:0] |
||||
return w.err |
||||
} |
||||
|
||||
// Close calls Flush and then closes the Writer.
|
||||
func (w *Writer) Close() error { |
||||
w.Flush() |
||||
ret := w.err |
||||
if w.err == nil { |
||||
w.err = errClosed |
||||
} |
||||
return ret |
||||
} |
@ -0,0 +1,29 @@ |
||||
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !appengine
|
||||
// +build gc
|
||||
// +build !noasm
|
||||
|
||||
package snappy |
||||
|
||||
// emitLiteral has the same semantics as in encode_other.go.
//
// This declaration has no body: the implementation is the assembly routine
// ·emitLiteral. It writes a literal chunk header to dst, copies lit after it,
// and returns the number of bytes written.
//
//go:noescape
func emitLiteral(dst, lit []byte) int
||||
|
||||
// emitCopy has the same semantics as in encode_other.go.
//
// This declaration has no body: the implementation is the assembly routine
// ·emitCopy. It writes one or more copy chunks for the given offset and length
// to dst and returns the number of bytes written.
//
//go:noescape
func emitCopy(dst []byte, offset, length int) int
||||
|
||||
// extendMatch has the same semantics as in encode_other.go.
//
// This declaration has no body: the implementation is the assembly routine
// ·extendMatch. It compares src[i:] against src[j:], advancing both, and
// returns the value of j at the first mismatch, or len(src) if j reaches the
// end of src first.
//
//go:noescape
func extendMatch(src []byte, i, j int) int
||||
|
||||
// encodeBlock has the same semantics as in encode_other.go.
//
// This declaration has no body: the implementation is the assembly routine
// ·encodeBlock. The result d is the number of bytes written to dst.
//
//go:noescape
func encodeBlock(dst, src []byte) (d int)
@ -0,0 +1,730 @@ |
||||
// Copyright 2016 The Go Authors. All rights reserved. |
||||
// Use of this source code is governed by a BSD-style |
||||
// license that can be found in the LICENSE file. |
||||
|
||||
// +build !appengine |
||||
// +build gc |
||||
// +build !noasm |
||||
|
||||
#include "textflag.h" |
||||
|
||||
// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a |
||||
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and |
||||
// https://github.com/golang/snappy/issues/29 |
||||
// |
||||
// As a workaround, the package was built with a known good assembler, and |
||||
// those instructions were disassembled by "objdump -d" to yield the |
||||
// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 |
||||
// style comments, in AT&T asm syntax. Note that rsp here is a physical |
||||
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm). |
||||
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble |
||||
// fine on Go 1.6. |
||||
|
||||
// The asm code generally follows the pure Go code in encode_other.go, except |
||||
// where marked with a "!!!". |
||||
|
||||
// ---------------------------------------------------------------------------- |
||||
|
||||
// func emitLiteral(dst, lit []byte) int
//
// emitLiteral writes a literal chunk header for lit to dst, copies lit after
// it with runtime·memmove, and returns the number of bytes written (header
// length plus len(lit)).
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n, which is len(lit) - 1
//	- DX	return value
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space are for the three arguments of the call to
// runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX

	// The return value starts as len(lit); the header length is added below.
	MOVQ AX, DX

	// n := len(lit) - 1
	MOVL AX, BX
	SUBL $1, BX

	// Choose the header form from n: below 60 it fits in the tag byte, below
	// 256 it takes one extra byte, otherwise two extra bytes.
	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	// Tag 0xf4 is 61<<2: n follows as 2 little-endian bytes.
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  memmove

twoBytes:
	// Tag 0xf0 is 60<<2: n follows as 1 byte.
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  memmove

oneByte:
	// The tag byte itself is n<<2.
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	// Store the return value now; the call below may clobber DX.
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET
||||
|
||||
// ---------------------------------------------------------------------------- |
||||
|
||||
// func emitCopy(dst []byte, offset, length int) int
//
// emitCopy writes one or more copy chunks to dst that together describe a
// copy of length bytes from offset bytes back, and returns the number of
// bytes written.
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	//
	// The threshold is 68 rather than 64, presumably so that at least 4 bytes
	// remain afterwards, the shortest copy the 2-byte form below can encode.
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes: the tag 0xfe is (64-1)<<2 | 2,
	// followed by the 2-byte little-endian offset.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes: the tag 0xee is (60-1)<<2 | 2,
	// followed by the 2-byte little-endian offset.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes. The second byte is the low
	// 8 bits of offset. The first byte is
	// uint8(offset>>8)<<5 | uint8(length-4)<<2 | 1.
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes: the tag is
	// uint8(length-1)<<2 | 2, followed by the 2-byte little-endian offset.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET
||||
|
||||
// ---------------------------------------------------------------------------- |
||||
|
||||
// func extendMatch(src []byte, i, j int) int
//
// extendMatch compares src[i:] against src[j:], advancing both positions while
// the bytes are equal, and returns the value of j at the first mismatch, or
// len(src) if j reaches the end of src first.
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI

	// Turn the length and the two indexes into pointers into src.
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI

	// R13 is the last position from which a full 8-byte load stays inside src.
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET
||||
|
||||
// ---------------------------------------------------------------------------- |
||||
|
||||
// func encodeBlock(dst, src []byte) (d int) |
||||
// |
||||
// All local variables fit into registers, other than "var table". The register |
||||
// allocation: |
||||
// - AX . . |
||||
// - BX . . |
||||
// - CX 56 shift (note that amd64 shifts by non-immediates must use CX). |
||||
// - DX 64 &src[0], tableSize |
||||
// - SI 72 &src[s] |
||||
// - DI 80 &dst[d] |
||||
// - R9 88 sLimit |
||||
// - R10 . &src[nextEmit] |
||||
// - R11 96 prevHash, currHash, nextHash, offset |
||||
// - R12 104 &src[base], skip |
||||
// - R13 . &src[nextS], &src[len(src) - 8] |
||||
// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x |
||||
// - R15 112 candidate |
||||
// |
||||
// The second column (56, 64, etc) is the stack offset to spill the registers |
||||
// when calling other functions. We could pack this slightly tighter, but it's |
||||
// simpler to have a dedicated spill map independent of the function called. |
||||
// |
||||
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An |
||||
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill |
||||
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888. |
||||
TEXT ·encodeBlock(SB), 0, $32888-56 |
||||
MOVQ dst_base+0(FP), DI |
||||
MOVQ src_base+24(FP), SI |
||||
MOVQ src_len+32(FP), R14 |
||||
|
||||
// shift, tableSize := uint32(32-8), 1<<8 |
||||
MOVQ $24, CX |
||||
MOVQ $256, DX |
||||
|
||||
calcShift: |
||||
// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
|
||||
// shift-- |
||||
// } |
||||
CMPQ DX, $16384 |
||||
JGE varTable |
||||
CMPQ DX, R14 |
||||
JGE varTable |
||||
SUBQ $1, CX |
||||
SHLQ $1, DX |
||||
JMP calcShift |
||||
|
||||
varTable: |
||||
// var table [maxTableSize]uint16 |
||||
// |
||||
// In the asm code, unlike the Go code, we can zero-initialize only the |
||||
// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU |
||||
// writes 16 bytes, so we can do only tableSize/8 writes instead of the |
||||
// 2048 writes that would zero-initialize all of table's 32768 bytes. |
||||
SHRQ $3, DX |
||||
LEAQ table-32768(SP), BX |
||||
PXOR X0, X0 |
||||
|
||||
memclr: |
||||
MOVOU X0, 0(BX) |
||||
ADDQ $16, BX |
||||
SUBQ $1, DX |
||||
JNZ memclr |
||||
|
||||
// !!! DX = &src[0] |
||||
MOVQ SI, DX |
||||
|
||||
// sLimit := len(src) - inputMargin |
||||
MOVQ R14, R9 |
||||
SUBQ $15, R9 |
||||
|
||||
// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't |
||||
// change for the rest of the function. |
||||
MOVQ CX, 56(SP) |
||||
MOVQ DX, 64(SP) |
||||
MOVQ R9, 88(SP) |
||||
|
||||
// nextEmit := 0 |
||||
MOVQ DX, R10 |
||||
|
||||
// s := 1 |
||||
ADDQ $1, SI |
||||
|
||||
// nextHash := hash(load32(src, s), shift) |
||||
MOVL 0(SI), R11 |
||||
IMULL $0x1e35a7bd, R11 |
||||
SHRL CX, R11 |
||||
|
||||
outer: |
||||
// for { etc } |
||||
|
||||
// skip := 32 |
||||
MOVQ $32, R12 |
||||
|
||||
// nextS := s |
||||
MOVQ SI, R13 |
||||
|
||||
// candidate := 0 |
||||
MOVQ $0, R15 |
||||
|
||||
inner0: |
||||
// for { etc } |
||||
|
||||
// s := nextS |
||||
MOVQ R13, SI |
||||
|
||||
// bytesBetweenHashLookups := skip >> 5 |
||||
MOVQ R12, R14 |
||||
SHRQ $5, R14 |
||||
|
||||
// nextS = s + bytesBetweenHashLookups |
||||
ADDQ R14, R13 |
||||
|
||||
// skip += bytesBetweenHashLookups |
||||
ADDQ R14, R12 |
||||
|
||||
// if nextS > sLimit { goto emitRemainder } |
||||
MOVQ R13, AX |
||||
SUBQ DX, AX |
||||
CMPQ AX, R9 |
||||
JA emitRemainder |
||||
|
||||
// candidate = int(table[nextHash]) |
||||
// XXX: MOVWQZX table-32768(SP)(R11*2), R15 |
||||
// XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 |
||||
BYTE $0x4e |
||||
BYTE $0x0f |
||||
BYTE $0xb7 |
||||
BYTE $0x7c |
||||
BYTE $0x5c |
||||
BYTE $0x78 |
||||
|
||||
// table[nextHash] = uint16(s) |
||||
MOVQ SI, AX |
||||
SUBQ DX, AX |
||||
|
||||
// XXX: MOVW AX, table-32768(SP)(R11*2) |
||||
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) |
||||
BYTE $0x66 |
||||
BYTE $0x42 |
||||
BYTE $0x89 |
||||
BYTE $0x44 |
||||
BYTE $0x5c |
||||
BYTE $0x78 |
||||
|
||||
// nextHash = hash(load32(src, nextS), shift) |
||||
MOVL 0(R13), R11 |
||||
IMULL $0x1e35a7bd, R11 |
||||
SHRL CX, R11 |
||||
|
||||
// if load32(src, s) != load32(src, candidate) { continue } break |
||||
MOVL 0(SI), AX |
||||
MOVL (DX)(R15*1), BX |
||||
CMPL AX, BX |
||||
JNE inner0 |
||||
|
||||
fourByteMatch: |
||||
// As per the encode_other.go code: |
||||
// |
||||
// A 4-byte match has been found. We'll later see etc. |
||||
|
||||
// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment |
||||
// on inputMargin in encode.go. |
||||
MOVQ SI, AX |
||||
SUBQ R10, AX |
||||
CMPQ AX, $16 |
||||
JLE emitLiteralFastPath |
||||
|
||||
// ---------------------------------------- |
||||
// Begin inline of the emitLiteral call. |
||||
// |
||||
// d += emitLiteral(dst[d:], src[nextEmit:s]) |
||||
|
||||
MOVL AX, BX |
||||
SUBL $1, BX |
||||
|
||||
CMPL BX, $60 |
||||
JLT inlineEmitLiteralOneByte |
||||
CMPL BX, $256 |
||||
JLT inlineEmitLiteralTwoBytes |
||||
|
||||
inlineEmitLiteralThreeBytes: |
||||
MOVB $0xf4, 0(DI) |
||||
MOVW BX, 1(DI) |
||||
ADDQ $3, DI |
||||
JMP inlineEmitLiteralMemmove |
||||
|
||||
inlineEmitLiteralTwoBytes: |
||||
MOVB $0xf0, 0(DI) |
||||
MOVB BX, 1(DI) |
||||
ADDQ $2, DI |
||||
JMP inlineEmitLiteralMemmove |
||||
|
||||
inlineEmitLiteralOneByte: |
||||
SHLB $2, BX |
||||
MOVB BX, 0(DI) |
||||
ADDQ $1, DI |
||||
|
||||
inlineEmitLiteralMemmove: |
||||
// Spill local variables (registers) onto the stack; call; unspill.
|
||||
// |
||||
// copy(dst[i:], lit) |
||||
// |
||||
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push |
||||
// DI, R10 and AX as arguments. |
||||
MOVQ DI, 0(SP) |
||||
MOVQ R10, 8(SP) |
||||
MOVQ AX, 16(SP) |
||||
ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)". |
||||
MOVQ SI, 72(SP) |
||||
MOVQ DI, 80(SP) |
||||
MOVQ R15, 112(SP) |
||||
CALL runtime·memmove(SB) |
||||
MOVQ 56(SP), CX |
||||
MOVQ 64(SP), DX |
||||
MOVQ 72(SP), SI |
||||
MOVQ 80(SP), DI |
||||
MOVQ 88(SP), R9 |
||||
MOVQ 112(SP), R15 |
||||
JMP inner1 |
||||
|
||||
inlineEmitLiteralEnd: |
||||
// End inline of the emitLiteral call. |
||||
// ---------------------------------------- |
||||
|
||||
emitLiteralFastPath: |
||||
// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2". |
||||
MOVB AX, BX |
||||
SUBB $1, BX |
||||
SHLB $2, BX |
||||
MOVB BX, (DI) |
||||
ADDQ $1, DI |
||||
|
||||
// !!! Implement the copy from lit to dst as a 16-byte load and store. |
||||
// (Encode's documentation says that dst and src must not overlap.) |
||||
// |
||||
// This always copies 16 bytes, instead of only len(lit) bytes, but that's |
||||
// OK. Subsequent iterations will fix up the overrun. |
||||
// |
||||
// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or |
||||
// 16-byte loads and stores. This technique probably wouldn't be as |
||||
// effective on architectures that are fussier about alignment. |
||||
MOVOU 0(R10), X0 |
||||
MOVOU X0, 0(DI) |
||||
ADDQ AX, DI |
||||
|
||||
inner1: |
||||
// for { etc } |
||||
|
||||
// base := s |
||||
MOVQ SI, R12 |
||||
|
||||
// !!! offset := base - candidate |
||||
MOVQ R12, R11 |
||||
SUBQ R15, R11 |
||||
SUBQ DX, R11 |
||||
|
||||
// ---------------------------------------- |
||||
// Begin inline of the extendMatch call. |
||||
// |
||||
// s = extendMatch(src, candidate+4, s+4) |
||||
|
||||
// !!! R14 = &src[len(src)] |
||||
MOVQ src_len+32(FP), R14 |
||||
ADDQ DX, R14 |
||||
|
||||
// !!! R13 = &src[len(src) - 8] |
||||
MOVQ R14, R13 |
||||
SUBQ $8, R13 |
||||
|
||||
// !!! R15 = &src[candidate + 4] |
||||
ADDQ $4, R15 |
||||
ADDQ DX, R15 |
||||
|
||||
// !!! s += 4 |
||||
ADDQ $4, SI |
||||
|
||||
inlineExtendMatchCmp8: |
||||
// As long as we are 8 or more bytes before the end of src, we can load and |
||||
// compare 8 bytes at a time. If those 8 bytes are equal, repeat. |
||||
CMPQ SI, R13 |
||||
JA inlineExtendMatchCmp1 |
||||
MOVQ (R15), AX |
||||
MOVQ (SI), BX |
||||
CMPQ AX, BX |
||||
JNE inlineExtendMatchBSF |
||||
ADDQ $8, R15 |
||||
ADDQ $8, SI |
||||
JMP inlineExtendMatchCmp8 |
||||
|
||||
inlineExtendMatchBSF: |
||||
// If those 8 bytes were not equal, XOR the two 8 byte values, and return |
||||
// the index of the first byte that differs. The BSF instruction finds the |
||||
// least significant 1 bit, the amd64 architecture is little-endian, and |
||||
// the shift by 3 converts a bit index to a byte index. |
||||
XORQ AX, BX |
||||
BSFQ BX, BX |
||||
SHRQ $3, BX |
||||
ADDQ BX, SI |
||||
JMP inlineExtendMatchEnd |
||||
|
||||
inlineExtendMatchCmp1: |
||||
// In src's tail, compare 1 byte at a time. |
||||
CMPQ SI, R14 |
||||
JAE inlineExtendMatchEnd |
||||
MOVB (R15), AX |
||||
MOVB (SI), BX |
||||
CMPB AX, BX |
||||
JNE inlineExtendMatchEnd |
||||
ADDQ $1, R15 |
||||
ADDQ $1, SI |
||||
JMP inlineExtendMatchCmp1 |
||||
|
||||
inlineExtendMatchEnd: |
||||
// End inline of the extendMatch call. |
||||
// ---------------------------------------- |
||||
|
||||
// ---------------------------------------- |
||||
// Begin inline of the emitCopy call. |
||||
// |
||||
// d += emitCopy(dst[d:], base-candidate, s-base) |
||||
|
||||
// !!! length := s - base |
||||
MOVQ SI, AX |
||||
SUBQ R12, AX |
||||
|
||||
inlineEmitCopyLoop0: |
||||
// for length >= 68 { etc } |
||||
CMPL AX, $68 |
||||
JLT inlineEmitCopyStep1 |
||||
|
||||
// Emit a length 64 copy, encoded as 3 bytes. |
||||
MOVB $0xfe, 0(DI) |
||||
MOVW R11, 1(DI) |
||||
ADDQ $3, DI |
||||
SUBL $64, AX |
||||
JMP inlineEmitCopyLoop0 |
||||
|
||||
inlineEmitCopyStep1: |
||||
// if length > 64 { etc } |
||||
CMPL AX, $64 |
||||
JLE inlineEmitCopyStep2 |
||||
|
||||
// Emit a length 60 copy, encoded as 3 bytes. |
||||
MOVB $0xee, 0(DI) |
||||
MOVW R11, 1(DI) |
||||
ADDQ $3, DI |
||||
SUBL $60, AX |
||||
|
||||
inlineEmitCopyStep2: |
||||
// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 } |
||||
CMPL AX, $12 |
||||
JGE inlineEmitCopyStep3 |
||||
CMPL R11, $2048 |
||||
JGE inlineEmitCopyStep3 |
||||
|
||||
// Emit the remaining copy, encoded as 2 bytes. |
||||
MOVB R11, 1(DI) |
||||
SHRL $8, R11 |
||||
SHLB $5, R11 |
||||
SUBB $4, AX |
||||
SHLB $2, AX |
||||
ORB AX, R11 |
||||
ORB $1, R11 |
||||
MOVB R11, 0(DI) |
||||
ADDQ $2, DI |
||||
JMP inlineEmitCopyEnd |
||||
|
||||
inlineEmitCopyStep3: |
||||
// Emit the remaining copy, encoded as 3 bytes. |
||||
SUBL $1, AX |
||||
SHLB $2, AX |
||||
ORB $2, AX |
||||
MOVB AX, 0(DI) |
||||
MOVW R11, 1(DI) |
||||
ADDQ $3, DI |
||||
|
||||
inlineEmitCopyEnd: |
||||
// End inline of the emitCopy call. |
||||
// ---------------------------------------- |
||||
|
||||
// nextEmit = s |
||||
MOVQ SI, R10 |
||||
|
||||
// if s >= sLimit { goto emitRemainder } |
||||
MOVQ SI, AX |
||||
SUBQ DX, AX |
||||
CMPQ AX, R9 |
||||
JAE emitRemainder |
||||
|
||||
// As per the encode_other.go code: |
||||
// |
||||
// We could immediately etc. |
||||
|
||||
// x := load64(src, s-1) |
||||
MOVQ -1(SI), R14 |
||||
|
||||
// prevHash := hash(uint32(x>>0), shift) |
||||
MOVL R14, R11 |
||||
IMULL $0x1e35a7bd, R11 |
||||
SHRL CX, R11 |
||||
|
||||
// table[prevHash] = uint16(s-1) |
||||
MOVQ SI, AX |
||||
SUBQ DX, AX |
||||
SUBQ $1, AX |
||||
|
||||
// XXX: MOVW AX, table-32768(SP)(R11*2) |
||||
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) |
||||
BYTE $0x66 |
||||
BYTE $0x42 |
||||
BYTE $0x89 |
||||
BYTE $0x44 |
||||
BYTE $0x5c |
||||
BYTE $0x78 |
||||
|
||||
// currHash := hash(uint32(x>>8), shift) |
||||
SHRQ $8, R14 |
||||
MOVL R14, R11 |
||||
IMULL $0x1e35a7bd, R11 |
||||
SHRL CX, R11 |
||||
|
||||
// candidate = int(table[currHash]) |
||||
// XXX: MOVWQZX table-32768(SP)(R11*2), R15 |
||||
// XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15 |
||||
BYTE $0x4e |
||||
BYTE $0x0f |
||||
BYTE $0xb7 |
||||
BYTE $0x7c |
||||
BYTE $0x5c |
||||
BYTE $0x78 |
||||
|
||||
// table[currHash] = uint16(s) |
||||
ADDQ $1, AX |
||||
|
||||
// XXX: MOVW AX, table-32768(SP)(R11*2) |
||||
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2) |
||||
BYTE $0x66 |
||||
BYTE $0x42 |
||||
BYTE $0x89 |
||||
BYTE $0x44 |
||||
BYTE $0x5c |
||||
BYTE $0x78 |
||||
|
||||
// if uint32(x>>8) == load32(src, candidate) { continue } |
||||
MOVL (DX)(R15*1), BX |
||||
CMPL R14, BX |
||||
JEQ inner1 |
||||
|
||||
// nextHash = hash(uint32(x>>16), shift) |
||||
SHRQ $8, R14 |
||||
MOVL R14, R11 |
||||
IMULL $0x1e35a7bd, R11 |
||||
SHRL CX, R11 |
||||
|
||||
// s++ |
||||
ADDQ $1, SI |
||||
|
||||
// break out of the inner1 for loop, i.e. continue the outer loop. |
||||
JMP outer |
||||
|
||||
emitRemainder: |
||||
// if nextEmit < len(src) { etc } |
||||
MOVQ src_len+32(FP), AX |
||||
ADDQ DX, AX |
||||
CMPQ R10, AX |
||||
JEQ encodeBlockEnd |
||||
|
||||
// d += emitLiteral(dst[d:], src[nextEmit:]) |
||||
// |
||||
// Push args. |
||||
MOVQ DI, 0(SP) |
||||
MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative. |
||||
MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative. |
||||
MOVQ R10, 24(SP) |
||||
SUBQ R10, AX |
||||
MOVQ AX, 32(SP) |
||||
MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative. |
||||
|
||||
// Spill local variables (registers) onto the stack; call; unspill.
|
||||
MOVQ DI, 80(SP) |
||||
CALL ·emitLiteral(SB) |
||||
MOVQ 80(SP), DI |
||||
|
||||
// Finish the "d +=" part of "d += emitLiteral(etc)". |
||||
ADDQ 48(SP), DI |
||||
|
||||
encodeBlockEnd: |
||||
MOVQ dst_base+0(FP), AX |
||||
SUBQ AX, DI |
||||
MOVQ DI, d+48(FP) |
||||
RET |
@ -0,0 +1,238 @@ |
||||
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64 appengine !gc noasm
|
||||
|
||||
package snappy |
||||
|
||||
// load32 returns the 4 bytes b[i], b[i+1], b[i+2], b[i+3] assembled as a
// little-endian uint32 (b[i] is the least significant byte).
//
// The caller must ensure that i+4 <= len(b); otherwise the slice expression
// below panics.
func load32(b []byte, i int) uint32 {
	b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
||||
|
||||
// load64 returns the 8 bytes b[i] through b[i+7] assembled as a little-endian
// uint64 (b[i] is the least significant byte).
//
// The caller must ensure that i+8 <= len(b); otherwise the slice expression
// below panics.
func load64(b []byte, i int) uint64 {
	b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
||||
|
||||
// emitLiteral writes a literal chunk and returns the number of bytes written.
|
||||
//
|
||||
// It assumes that:
|
||||
// dst is long enough to hold the encoded bytes
|
||||
// 1 <= len(lit) && len(lit) <= 65536
|
||||
func emitLiteral(dst, lit []byte) int { |
||||
i, n := 0, uint(len(lit)-1) |
||||
switch { |
||||
case n < 60: |
||||
dst[0] = uint8(n)<<2 | tagLiteral |
||||
i = 1 |
||||
case n < 1<<8: |
||||
dst[0] = 60<<2 | tagLiteral |
||||
dst[1] = uint8(n) |
||||
i = 2 |
||||
default: |
||||
dst[0] = 61<<2 | tagLiteral |
||||
dst[1] = uint8(n) |
||||
dst[2] = uint8(n >> 8) |
||||
i = 3 |
||||
} |
||||
return i + copy(dst[i:], lit) |
||||
} |
||||
|
||||
// emitCopy writes a copy chunk and returns the number of bytes written.
|
||||
//
|
||||
// It assumes that:
|
||||
// dst is long enough to hold the encoded bytes
|
||||
// 1 <= offset && offset <= 65535
|
||||
// 4 <= length && length <= 65535
|
||||
func emitCopy(dst []byte, offset, length int) int { |
||||
i := 0 |
||||
// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
|
||||
// threshold for this loop is a little higher (at 68 = 64 + 4), and the
|
||||
// length emitted down below is is a little lower (at 60 = 64 - 4), because
|
||||
// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
|
||||
// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
|
||||
// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
|
||||
// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
|
||||
// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
|
||||
// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
|
||||
for length >= 68 { |
||||
// Emit a length 64 copy, encoded as 3 bytes.
|
||||
dst[i+0] = 63<<2 | tagCopy2 |
||||
dst[i+1] = uint8(offset) |
||||
dst[i+2] = uint8(offset >> 8) |
||||
i += 3 |
||||
length -= 64 |
||||
} |
||||
if length > 64 { |
||||
// Emit a length 60 copy, encoded as 3 bytes.
|
||||
dst[i+0] = 59<<2 | tagCopy2 |
||||
dst[i+1] = uint8(offset) |
||||
dst[i+2] = uint8(offset >> 8) |
||||
i += 3 |
||||
length -= 60 |
||||
} |
||||
if length >= 12 || offset >= 2048 { |
||||
// Emit the remaining copy, encoded as 3 bytes.
|
||||
dst[i+0] = uint8(length-1)<<2 | tagCopy2 |
||||
dst[i+1] = uint8(offset) |
||||
dst[i+2] = uint8(offset >> 8) |
||||
return i + 3 |
||||
} |
||||
// Emit the remaining copy, encoded as 2 bytes.
|
||||
dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 |
||||
dst[i+1] = uint8(offset) |
||||
return i + 2 |
||||
} |
||||
|
||||
// extendMatch returns the largest k such that k <= len(src) and that
// src[i:i+k-j] and src[j:k] have the same contents.
//
// Put differently: starting from positions i and j, it advances both in
// lockstep while the bytes agree and j is still inside src, and returns the
// final value of j.
//
// It assumes that:
//	0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
	}
	return j
}
||||
|
||||
// hash maps u to a hash table index: it multiplies u by the constant
// 0x1e35a7bd (the product wraps at 32 bits) and keeps the top 32-shift bits
// of the result.
func hash(u, shift uint32) uint32 {
	return (u * 0x1e35a7bd) >> shift
}
||||
|
||||
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
|
||||
// assumes that the varint-encoded length of the decompressed bytes has already
|
||||
// been written.
|
||||
//
|
||||
// It also assumes that:
|
||||
// len(dst) >= MaxEncodedLen(len(src)) &&
|
||||
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
|
||||
func encodeBlock(dst, src []byte) (d int) { |
||||
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
|
||||
// The table element type is uint16, as s < sLimit and sLimit < len(src)
|
||||
// and len(src) <= maxBlockSize and maxBlockSize == 65536.
|
||||
const ( |
||||
maxTableSize = 1 << 14 |
||||
// tableMask is redundant, but helps the compiler eliminate bounds
|
||||
// checks.
|
||||
tableMask = maxTableSize - 1 |
||||
) |
||||
shift := uint32(32 - 8) |
||||
for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 { |
||||
shift-- |
||||
} |
||||
// In Go, all array elements are zero-initialized, so there is no advantage
|
||||
// to a smaller tableSize per se. However, it matches the C++ algorithm,
|
||||
// and in the asm versions of this code, we can get away with zeroing only
|
||||
// the first tableSize elements.
|
||||
var table [maxTableSize]uint16 |
||||
|
||||
// sLimit is when to stop looking for offset/length copies. The inputMargin
|
||||
// lets us use a fast path for emitLiteral in the main loop, while we are
|
||||
// looking for copies.
|
||||
sLimit := len(src) - inputMargin |
||||
|
||||
// nextEmit is where in src the next emitLiteral should start from.
|
||||
nextEmit := 0 |
||||
|
||||
// The encoded form must start with a literal, as there are no previous
|
||||
// bytes to copy, so we start looking for hash matches at s == 1.
|
||||
s := 1 |
||||
nextHash := hash(load32(src, s), shift) |
||||
|
||||
for { |
||||
// Copied from the C++ snappy implementation:
|
||||
//
|
||||
// Heuristic match skipping: If 32 bytes are scanned with no matches
|
||||
// found, start looking only at every other byte. If 32 more bytes are
|
||||
// scanned (or skipped), look at every third byte, etc.. When a match
|
||||
// is found, immediately go back to looking at every byte. This is a
|
||||
// small loss (~5% performance, ~0.1% density) for compressible data
|
||||
// due to more bookkeeping, but for non-compressible data (such as
|
||||
// JPEG) it's a huge win since the compressor quickly "realizes" the
|
||||
// data is incompressible and doesn't bother looking for matches
|
||||
// everywhere.
|
||||
//
|
||||
// The "skip" variable keeps track of how many bytes there are since
|
||||
// the last match; dividing it by 32 (ie. right-shifting by five) gives
|
||||
// the number of bytes to move ahead for each iteration.
|
||||
skip := 32 |
||||
|
||||
nextS := s |
||||
candidate := 0 |
||||
for { |
||||
s = nextS |
||||
bytesBetweenHashLookups := skip >> 5 |
||||
nextS = s + bytesBetweenHashLookups |
||||
skip += bytesBetweenHashLookups |
||||
if nextS > sLimit { |
||||
goto emitRemainder |
||||
} |
||||
candidate = int(table[nextHash&tableMask]) |
||||
table[nextHash&tableMask] = uint16(s) |
||||
nextHash = hash(load32(src, nextS), shift) |
||||
if load32(src, s) == load32(src, candidate) { |
||||
break |
||||
} |
||||
} |
||||
|
||||
// A 4-byte match has been found. We'll later see if more than 4 bytes
|
||||
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
|
||||
// them as literal bytes.
|
||||
d += emitLiteral(dst[d:], src[nextEmit:s]) |
||||
|
||||
// Call emitCopy, and then see if another emitCopy could be our next
|
||||
// move. Repeat until we find no match for the input immediately after
|
||||
// what was consumed by the last emitCopy call.
|
||||
//
|
||||
// If we exit this loop normally then we need to call emitLiteral next,
|
||||
// though we don't yet know how big the literal will be. We handle that
|
||||
// by proceeding to the next iteration of the main loop. We also can
|
||||
// exit this loop via goto if we get close to exhausting the input.
|
||||
for { |
||||
// Invariant: we have a 4-byte match at s, and no need to emit any
|
||||
// literal bytes prior to s.
|
||||
base := s |
||||
|
||||
// Extend the 4-byte match as long as possible.
|
||||
//
|
||||
// This is an inlined version of:
|
||||
// s = extendMatch(src, candidate+4, s+4)
|
||||
s += 4 |
||||
for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 { |
||||
} |
||||
|
||||
d += emitCopy(dst[d:], base-candidate, s-base) |
||||
nextEmit = s |
||||
if s >= sLimit { |
||||
goto emitRemainder |
||||
} |
||||
|
||||
// We could immediately start working at s now, but to improve
|
||||
// compression we first update the hash table at s-1 and at s. If
|
||||
// another emitCopy is not our next move, also calculate nextHash
|
||||
// at s+1. At least on GOARCH=amd64, these three hash calculations
|
||||
// are faster as one load64 call (with some shifts) instead of
|
||||
// three load32 calls.
|
||||
x := load64(src, s-1) |
||||
prevHash := hash(uint32(x>>0), shift) |
||||
table[prevHash&tableMask] = uint16(s - 1) |
||||
currHash := hash(uint32(x>>8), shift) |
||||
candidate = int(table[currHash&tableMask]) |
||||
table[currHash&tableMask] = uint16(s) |
||||
if uint32(x>>8) != load32(src, candidate) { |
||||
nextHash = hash(uint32(x>>16), shift) |
||||
s++ |
||||
break |
||||
} |
||||
} |
||||
} |
||||
|
||||
emitRemainder: |
||||
if nextEmit < len(src) { |
||||
d += emitLiteral(dst[d:], src[nextEmit:]) |
||||
} |
||||
return d |
||||
} |
@ -0,0 +1 @@ |
||||
module github.com/golang/snappy |
@ -0,0 +1,98 @@ |
||||
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package snappy implements the Snappy compression format. It aims for very
|
||||
// high speeds and reasonable compression.
|
||||
//
|
||||
// There are actually two Snappy formats: block and stream. They are related,
|
||||
// but different: trying to decompress block-compressed data as a Snappy stream
|
||||
// will fail, and vice versa. The block format is the Decode and Encode
|
||||
// functions and the stream format is the Reader and Writer types.
|
||||
//
|
||||
// The block format, the more common case, is used when the complete size (the
|
||||
// number of bytes) of the original data is known upfront, at the time
|
||||
// compression starts. The stream format, also known as the framing format, is
|
||||
// for when that isn't always true.
|
||||
//
|
||||
// The canonical, C++ implementation is at https://github.com/google/snappy and
|
||||
// it only implements the block format.
|
||||
package snappy // import "github.com/golang/snappy"
|
||||
|
||||
import ( |
||||
"hash/crc32" |
||||
) |
||||
|
||||
/*
Each encoded block begins with the varint-encoded length of the decoded data,
followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
first byte of each chunk is broken into its 2 least and 6 most significant bits
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
Zero means a literal tag. All other values mean a copy tag.

For literal tags:
  - If m < 60, the next 1 + m bytes are literal bytes.
  - Otherwise, let n be the little-endian unsigned integer denoted by the next
    m - 59 bytes. The next 1 + n bytes after that are literal bytes.

For copy tags, length bytes are copied from offset bytes ago, in the style of
Lempel-Ziv compression algorithms. In particular:
  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
    of the offset. The next byte is bits 0-7 of the offset.
  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
    The length is 1 + m. The offset is the little-endian unsigned integer
    denoted by the next 2 bytes.
  - For l == 3, this tag is a legacy format that is no longer issued by most
    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
    integer denoted by the next 4 bytes.
*/
const (
	tagLiteral = 0x00 // l == 0: literal chunk.
	tagCopy1   = 0x01 // l == 1: copy with an 11-bit offset.
	tagCopy2   = 0x02 // l == 2: copy with a 16-bit offset.
	tagCopy4   = 0x03 // l == 3: copy with a 32-bit offset (legacy).
)
||||
|
||||
// Sizes and magic values used by the encoder and by the framing (stream)
// format.
const (
	checksumSize    = 4
	chunkHeaderSize = 4
	magicChunk      = "\xff\x06\x00\x00" + magicBody
	magicBody       = "sNaPpY"

	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
	// part of the wire format per se, but some parts of the encoder assume
	// that an offset fits into a uint16.
	//
	// Also, for the framing format (Writer type instead of Encode function),
	// https://github.com/google/snappy/blob/master/framing_format.txt says
	// that "the uncompressed data in a chunk must be no longer than 65536
	// bytes".
	maxBlockSize = 65536

	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
	// hard coded to be a const instead of a variable, so that obufLen can also
	// be a const. Their equivalence is confirmed by
	// TestMaxEncodedLenOfMaxBlockSize.
	maxEncodedLenOfMaxBlockSize = 76490

	// obufHeaderLen is the combined length of the magic chunk, a checksum and
	// a chunk header; obufLen adds room for one maximally sized encoded block.
	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
)
||||
|
||||
// Chunk type identifiers used by the framing (stream) format.
const (
	chunkTypeCompressedData   = 0x00
	chunkTypeUncompressedData = 0x01
	chunkTypePadding          = 0xfe
	chunkTypeStreamIdentifier = 0xff
)
||||
|
||||
// crcTable is the CRC-32 table for the Castagnoli polynomial, built once at
// package initialization and shared by every crc call.
var crcTable = crc32.MakeTable(crc32.Castagnoli)

// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
//
// It computes the CRC-32 (Castagnoli) of b, rotates it right by 15 bits and
// adds the constant 0xa282ead8 (wrapping at 32 bits).
func crc(b []byte) uint32 {
	c := crc32.Update(0, crcTable, b)
	return (c>>15 | c<<17) + 0xa282ead8
}
@ -0,0 +1 @@ |
||||
*.test |
@ -0,0 +1,38 @@ |
||||
Goavro was originally created during the Fall of 2014 at LinkedIn, |
||||
Corp., in New York City, New York, USA. |
||||
|
||||
The following persons, listed in alphabetical order, have participated |
||||
with goavro development by contributing code and test cases. |
||||
|
||||
Alan Gardner <alanctgardner@gmail.com> |
||||
Billy Hand <bhand@mediamath.com> |
||||
Christian Blades <christian.blades@careerbuilder.com> |
||||
Corey Scott <corey.scott@gmail.com> |
||||
Darshan Shaligram <scintilla@gmail.com> |
||||
Dylan Wen <hhkbp2@gmail.com> |
||||
Enrico Candino <enrico.candino@gmail.com> |
||||
Fellyn Silliman <fsilliman@linkedin.com> |
||||
James Crasta <jcrasta@underarmour.com> |
||||
Jeff Haynie <jhaynie@gmail.com> |
||||
Joe Roth <joseph_roth@cable.comcast.com> |
||||
Karrick S. McDermott <kmcdermott@linkedin.com> |
||||
Kasey Klipsch <kklipsch@mediamath.com> |
||||
Michael Johnson <mijohnson@linkedin.com> |
||||
Murray Resinski <murray.resinski@octanner.com> |
||||
Nicolas Kaiser <nikai@nikai.net> |
||||
Sebastien Launay <sebastien@opendns.com> |
||||
Thomas Desrosiers <thomasdesr@gmail.com> |
||||
kklipsch <junk@klipsch.net> |
||||
seborama <sebastien.chatal@sainsburys.co.uk> |
||||
|
||||
A big thank you to these persons who provided testing and amazing |
||||
feedback to goavro during its initial implementation: |
||||
|
||||
Dennis Ordanov <dordanov@linkedin.com> |
||||
Thomas Desrosiers <thomasdesr@gmail.com> |
||||
|
||||
Also a big thank you is extended to our supervisors who supported our |
||||
efforts to bring goavro to the open source community: |
||||
|
||||
Greg Leffler <gleffler@linkedin.com> |
||||
Nick Berry <niberry@linkedin.com> |
@ -0,0 +1,21 @@ |
||||
# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. |
||||
|
||||
|
||||
[[projects]] |
||||
branch = "master" |
||||
name = "github.com/golang/snappy" |
||||
packages = ["."] |
||||
revision = "553a641470496b2327abcac10b36396bd98e45c9" |
||||
|
||||
[[projects]] |
||||
name = "gopkg.in/linkedin/goavro.v1" |
||||
packages = ["."] |
||||
revision = "45b9a0062a837ab3214741a91bff89e2a2e4fae6" |
||||
version = "v1.0.5" |
||||
|
||||
[solve-meta] |
||||
analyzer-name = "dep" |
||||
analyzer-version = 1 |
||||
inputs-digest = "3badedae1f576bec3315e8c72e8e390df4ad16cdc4a5750c6468076ff5fba6c3" |
||||
solver-name = "gps-cdcl" |
||||
solver-version = 1 |
@ -0,0 +1,30 @@ |
||||
|
||||
# Gopkg.toml example |
||||
# |
||||
# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md |
||||
# for detailed Gopkg.toml documentation. |
||||
# |
||||
# required = ["github.com/user/thing/cmd/thing"] |
||||
# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] |
||||
# |
||||
# [[constraint]] |
||||
# name = "github.com/user/project" |
||||
# version = "1.0.0" |
||||
# |
||||
# [[constraint]] |
||||
# name = "github.com/user/project2" |
||||
# branch = "dev" |
||||
# source = "github.com/myfork/project2" |
||||
# |
||||
# [[override]] |
||||
# name = "github.com/x/y" |
||||
# version = "2.4.0" |
||||
|
||||
|
||||
[[constraint]] |
||||
branch = "master" |
||||
name = "github.com/golang/snappy" |
||||
|
||||
[[constraint]] |
||||
name = "gopkg.in/linkedin/goavro.v1" |
||||
version = "1.0.5" |
@ -0,0 +1,201 @@ |
||||
Apache License |
||||
Version 2.0, January 2004 |
||||
http://www.apache.org/licenses/ |
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION |
||||
|
||||
1. Definitions. |
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction, |
||||
and distribution as defined by Sections 1 through 9 of this document. |
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by |
||||
the copyright owner that is granting the License. |
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all |
||||
other entities that control, are controlled by, or are under common |
||||
control with that entity. For the purposes of this definition, |
||||
"control" means (i) the power, direct or indirect, to cause the |
||||
direction or management of such entity, whether by contract or |
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the |
||||
outstanding shares, or (iii) beneficial ownership of such entity. |
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity |
||||
exercising permissions granted by this License. |
||||
|
||||
"Source" form shall mean the preferred form for making modifications, |
||||
including but not limited to software source code, documentation |
||||
source, and configuration files. |
||||
|
||||
"Object" form shall mean any form resulting from mechanical |
||||
transformation or translation of a Source form, including but |
||||
not limited to compiled object code, generated documentation, |
||||
and conversions to other media types. |
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or |
||||
Object form, made available under the License, as indicated by a |
||||
copyright notice that is included in or attached to the work |
||||
(an example is provided in the Appendix below). |
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object |
||||
form, that is based on (or derived from) the Work and for which the |
||||
editorial revisions, annotations, elaborations, or other modifications |
||||
represent, as a whole, an original work of authorship. For the purposes |
||||
of this License, Derivative Works shall not include works that remain |
||||
separable from, or merely link (or bind by name) to the interfaces of, |
||||
the Work and Derivative Works thereof. |
||||
|
||||
"Contribution" shall mean any work of authorship, including |
||||
the original version of the Work and any modifications or additions |
||||
to that Work or Derivative Works thereof, that is intentionally |
||||
submitted to Licensor for inclusion in the Work by the copyright owner |
||||
or by an individual or Legal Entity authorized to submit on behalf of |
||||
the copyright owner. For the purposes of this definition, "submitted" |
||||
means any form of electronic, verbal, or written communication sent |
||||
to the Licensor or its representatives, including but not limited to |
||||
communication on electronic mailing lists, source code control systems, |
||||
and issue tracking systems that are managed by, or on behalf of, the |
||||
Licensor for the purpose of discussing and improving the Work, but |
||||
excluding communication that is conspicuously marked or otherwise |
||||
designated in writing by the copyright owner as "Not a Contribution." |
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity |
||||
on behalf of whom a Contribution has been received by Licensor and |
||||
subsequently incorporated within the Work. |
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
copyright license to reproduce, prepare Derivative Works of, |
||||
publicly display, publicly perform, sublicense, and distribute the |
||||
Work and such Derivative Works in Source or Object form. |
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of |
||||
this License, each Contributor hereby grants to You a perpetual, |
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable |
||||
(except as stated in this section) patent license to make, have made, |
||||
use, offer to sell, sell, import, and otherwise transfer the Work, |
||||
where such license applies only to those patent claims licensable |
||||
by such Contributor that are necessarily infringed by their |
||||
Contribution(s) alone or by combination of their Contribution(s) |
||||
with the Work to which such Contribution(s) was submitted. If You |
||||
institute patent litigation against any entity (including a |
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work |
||||
or a Contribution incorporated within the Work constitutes direct |
||||
or contributory patent infringement, then any patent licenses |
||||
granted to You under this License for that Work shall terminate |
||||
as of the date such litigation is filed. |
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the |
||||
Work or Derivative Works thereof in any medium, with or without |
||||
modifications, and in Source or Object form, provided that You |
||||
meet the following conditions: |
||||
|
||||
(a) You must give any other recipients of the Work or |
||||
Derivative Works a copy of this License; and |
||||
|
||||
(b) You must cause any modified files to carry prominent notices |
||||
stating that You changed the files; and |
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works |
||||
that You distribute, all copyright, patent, trademark, and |
||||
attribution notices from the Source form of the Work, |
||||
excluding those notices that do not pertain to any part of |
||||
the Derivative Works; and |
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its |
||||
distribution, then any Derivative Works that You distribute must |
||||
include a readable copy of the attribution notices contained |
||||
within such NOTICE file, excluding those notices that do not |
||||
pertain to any part of the Derivative Works, in at least one |
||||
of the following places: within a NOTICE text file distributed |
||||
as part of the Derivative Works; within the Source form or |
||||
documentation, if provided along with the Derivative Works; or, |
||||
within a display generated by the Derivative Works, if and |
||||
wherever such third-party notices normally appear. The contents |
||||
of the NOTICE file are for informational purposes only and |
||||
do not modify the License. You may add Your own attribution |
||||
notices within Derivative Works that You distribute, alongside |
||||
or as an addendum to the NOTICE text from the Work, provided |
||||
that such additional attribution notices cannot be construed |
||||
as modifying the License. |
||||
|
||||
You may add Your own copyright statement to Your modifications and |
||||
may provide additional or different license terms and conditions |
||||
for use, reproduction, or distribution of Your modifications, or |
||||
for any such Derivative Works as a whole, provided Your use, |
||||
reproduction, and distribution of the Work otherwise complies with |
||||
the conditions stated in this License. |
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise, |
||||
any Contribution intentionally submitted for inclusion in the Work |
||||
by You to the Licensor shall be under the terms and conditions of |
||||
this License, without any additional terms or conditions. |
||||
Notwithstanding the above, nothing herein shall supersede or modify |
||||
the terms of any separate license agreement you may have executed |
||||
with Licensor regarding such Contributions. |
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade |
||||
names, trademarks, service marks, or product names of the Licensor, |
||||
except as required for reasonable and customary use in describing the |
||||
origin of the Work and reproducing the content of the NOTICE file. |
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or |
||||
agreed to in writing, Licensor provides the Work (and each |
||||
Contributor provides its Contributions) on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
||||
implied, including, without limitation, any warranties or conditions |
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A |
||||
PARTICULAR PURPOSE. You are solely responsible for determining the |
||||
appropriateness of using or redistributing the Work and assume any |
||||
risks associated with Your exercise of permissions under this License. |
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory, |
||||
whether in tort (including negligence), contract, or otherwise, |
||||
unless required by applicable law (such as deliberate and grossly |
||||
negligent acts) or agreed to in writing, shall any Contributor be |
||||
liable to You for damages, including any direct, indirect, special, |
||||
incidental, or consequential damages of any character arising as a |
||||
result of this License or out of the use or inability to use the |
||||
Work (including but not limited to damages for loss of goodwill, |
||||
work stoppage, computer failure or malfunction, or any and all |
||||
other commercial damages or losses), even if such Contributor |
||||
has been advised of the possibility of such damages. |
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing |
||||
the Work or Derivative Works thereof, You may choose to offer, |
||||
and charge a fee for, acceptance of support, warranty, indemnity, |
||||
or other liability obligations and/or rights consistent with this |
||||
License. However, in accepting such obligations, You may act only |
||||
on Your own behalf and on Your sole responsibility, not on behalf |
||||
of any other Contributor, and only if You agree to indemnify, |
||||
defend, and hold each Contributor harmless for any liability |
||||
incurred by, or claims asserted against, such Contributor by reason |
||||
of your accepting any such warranty or additional liability. |
||||
|
||||
END OF TERMS AND CONDITIONS |
||||
|
||||
APPENDIX: How to apply the Apache License to your work. |
||||
|
||||
To apply the Apache License to your work, attach the following |
||||
boilerplate notice, with the fields enclosed by brackets "[]" |
||||
replaced with your own identifying information. (Don't include |
||||
the brackets!) The text should be enclosed in the appropriate |
||||
comment syntax for the file format. We also recommend that a |
||||
file or class name and description of purpose be included on the |
||||
same "printed page" as the copyright notice for easier |
||||
identification within third-party archives. |
||||
|
||||
Copyright [yyyy] [name of copyright owner] |
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); |
||||
you may not use this file except in compliance with the License. |
||||
You may obtain a copy of the License at |
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 |
||||
|
||||
Unless required by applicable law or agreed to in writing, software |
||||
distributed under the License is distributed on an "AS IS" BASIS, |
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
See the License for the specific language governing permissions and |
||||
limitations under the License. |
@ -0,0 +1,226 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"fmt" |
||||
"io" |
||||
"math" |
||||
"reflect" |
||||
) |
||||
|
||||
func makeArrayCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) { |
||||
// array type must have items
|
||||
itemSchema, ok := schemaMap["items"] |
||||
if !ok { |
||||
return nil, fmt.Errorf("Array ought to have items key") |
||||
} |
||||
itemCodec, err := buildCodec(st, enclosingNamespace, itemSchema) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Array items ought to be valid Avro type: %s", err) |
||||
} |
||||
|
||||
return &Codec{ |
||||
typeName: &name{"array", nullNamespace}, |
||||
nativeFromBinary: func(buf []byte) (interface{}, []byte, error) { |
||||
var value interface{} |
||||
var err error |
||||
|
||||
// block count and block size
|
||||
if value, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary array block count: %s", err) |
||||
} |
||||
blockCount := value.(int64) |
||||
if blockCount < 0 { |
||||
// NOTE: A negative block count implies there is a long encoded
|
||||
// block size following the negative block count. We have no use
|
||||
// for the block size in this decoder, so we read and discard
|
||||
// the value.
|
||||
if blockCount == math.MinInt64 { |
||||
// The minimum number for any signed numerical type can never be made positive
|
||||
return nil, nil, fmt.Errorf("cannot decode binary array with block count: %d", math.MinInt64) |
||||
} |
||||
blockCount = -blockCount // convert to its positive equivalent
|
||||
if _, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary array block size: %s", err) |
||||
} |
||||
} |
||||
// Ensure block count does not exceed some sane value.
|
||||
if blockCount > MaxBlockCount { |
||||
return nil, nil, fmt.Errorf("cannot decode binary array when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount) |
||||
} |
||||
// NOTE: While the attempt of a RAM optimization shown below is not
|
||||
// necessary, many encoders will encode all items in a single block.
|
||||
// We can optimize amount of RAM allocated by runtime for the array
|
||||
// by initializing the array for that number of items.
|
||||
arrayValues := make([]interface{}, 0, blockCount) |
||||
|
||||
for blockCount != 0 { |
||||
// Decode `blockCount` datum values from buffer
|
||||
for i := int64(0); i < blockCount; i++ { |
||||
if value, buf, err = itemCodec.nativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary array item %d: %s", i+1, err) |
||||
} |
||||
arrayValues = append(arrayValues, value) |
||||
} |
||||
// Decode next blockCount from buffer, because there may be more blocks
|
||||
if value, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary array block count: %s", err) |
||||
} |
||||
blockCount = value.(int64) |
||||
if blockCount < 0 { |
||||
// NOTE: A negative block count implies there is a long
|
||||
// encoded block size following the negative block count. We
|
||||
// have no use for the block size in this decoder, so we
|
||||
// read and discard the value.
|
||||
if blockCount == math.MinInt64 { |
||||
// The minimum number for any signed numerical type can
|
||||
// never be made positive
|
||||
return nil, nil, fmt.Errorf("cannot decode binary array with block count: %d", math.MinInt64) |
||||
} |
||||
blockCount = -blockCount // convert to its positive equivalent
|
||||
if _, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary array block size: %s", err) |
||||
} |
||||
} |
||||
// Ensure block count does not exceed some sane value.
|
||||
if blockCount > MaxBlockCount { |
||||
return nil, nil, fmt.Errorf("cannot decode binary array when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount) |
||||
} |
||||
} |
||||
return arrayValues, buf, nil |
||||
}, |
||||
binaryFromNative: func(buf []byte, datum interface{}) ([]byte, error) { |
||||
arrayValues, err := convertArray(datum) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot encode binary array: %s", err) |
||||
} |
||||
|
||||
arrayLength := int64(len(arrayValues)) |
||||
var alreadyEncoded, remainingInBlock int64 |
||||
|
||||
for i, item := range arrayValues { |
||||
if remainingInBlock == 0 { // start a new block
|
||||
remainingInBlock = arrayLength - alreadyEncoded |
||||
if remainingInBlock > MaxBlockCount { |
||||
// limit block count to MacBlockCount
|
||||
remainingInBlock = MaxBlockCount |
||||
} |
||||
buf, _ = longBinaryFromNative(buf, remainingInBlock) |
||||
} |
||||
|
||||
if buf, err = itemCodec.binaryFromNative(buf, item); err != nil { |
||||
return nil, fmt.Errorf("cannot encode binary array item %d: %v: %s", i+1, item, err) |
||||
} |
||||
|
||||
remainingInBlock-- |
||||
alreadyEncoded++ |
||||
} |
||||
|
||||
return longBinaryFromNative(buf, 0) // append trailing 0 block count to signal end of Array
|
||||
}, |
||||
nativeFromTextual: func(buf []byte) (interface{}, []byte, error) { |
||||
var arrayValues []interface{} |
||||
var value interface{} |
||||
var err error |
||||
var b byte |
||||
|
||||
if buf, err = advanceAndConsume(buf, '['); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual array: %s", err) |
||||
} |
||||
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer) |
||||
} |
||||
// NOTE: Special case for empty array
|
||||
if buf[0] == ']' { |
||||
return arrayValues, buf[1:], nil |
||||
} |
||||
|
||||
// NOTE: Also terminates when read ']' byte.
|
||||
for len(buf) > 0 { |
||||
// decode value
|
||||
value, buf, err = itemCodec.nativeFromTextual(buf) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual array: %s", err) |
||||
} |
||||
arrayValues = append(arrayValues, value) |
||||
// either comma or closing curly brace
|
||||
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer) |
||||
} |
||||
switch b = buf[0]; b { |
||||
case ']': |
||||
return arrayValues, buf[1:], nil |
||||
case ',': |
||||
// no-op
|
||||
default: |
||||
return nil, nil, fmt.Errorf("cannot decode textual array: expected ',' or ']'; received: %q", b) |
||||
} |
||||
// NOTE: consume comma from above
|
||||
if buf, _ = advanceToNonWhitespace(buf[1:]); len(buf) == 0 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer) |
||||
} |
||||
} |
||||
return nil, buf, io.ErrShortBuffer |
||||
}, |
||||
textualFromNative: func(buf []byte, datum interface{}) ([]byte, error) { |
||||
arrayValues, err := convertArray(datum) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot encode textual array: %s", err) |
||||
} |
||||
|
||||
var atLeastOne bool |
||||
|
||||
buf = append(buf, '[') |
||||
|
||||
for i, item := range arrayValues { |
||||
atLeastOne = true |
||||
|
||||
// Encode value
|
||||
buf, err = itemCodec.textualFromNative(buf, item) |
||||
if err != nil { |
||||
// field was specified in datum; therefore its value was invalid
|
||||
return nil, fmt.Errorf("cannot encode textual array item %d; %v: %s", i+1, item, err) |
||||
} |
||||
buf = append(buf, ',') |
||||
} |
||||
|
||||
if atLeastOne { |
||||
return append(buf[:len(buf)-1], ']'), nil |
||||
} |
||||
return append(buf, ']'), nil |
||||
}, |
||||
}, nil |
||||
} |
||||
|
||||
// convertArray returns datum as a []interface{}. A value that already is a
// []interface{} is handed back unchanged; a slice of any other element type is
// copied element by element through reflection as a convenience to the client.
// Any datum that is not a slice produces an error.
func convertArray(datum interface{}) ([]interface{}, error) {
	if items, alreadyGeneric := datum.([]interface{}); alreadyGeneric {
		return items, nil
	}

	slice := reflect.ValueOf(datum)
	if slice.Kind() != reflect.Slice {
		return nil, fmt.Errorf("cannot create []interface{}: expected slice; received: %T", datum)
	}

	// Box every element individually; this costs one pass over the slice.
	items := make([]interface{}, slice.Len())
	for i := range items {
		items[i] = slice.Index(i).Interface()
	}
	return items, nil
}
@ -0,0 +1,160 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"fmt" |
||||
"io" |
||||
"math" |
||||
) |
||||
|
||||
// bytesBinaryReader reads bytes from io.Reader and returns byte slice of
|
||||
// specified size or the error encountered while trying to read those bytes.
|
||||
func bytesBinaryReader(ior io.Reader) ([]byte, error) { |
||||
size, err := longBinaryReader(ior) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot read bytes: cannot read size: %s", err) |
||||
} |
||||
if size < 0 { |
||||
return nil, fmt.Errorf("cannot read bytes: size is negative: %d", size) |
||||
} |
||||
if size > MaxBlockSize { |
||||
return nil, fmt.Errorf("cannot read bytes: size exceeds MaxBlockSize: %d > %d", size, MaxBlockSize) |
||||
} |
||||
buf := make([]byte, size) |
||||
_, err = io.ReadAtLeast(ior, buf, int(size)) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot read bytes: %s", err) |
||||
} |
||||
return buf, nil |
||||
} |
||||
|
||||
// longBinaryReader reads bytes from io.Reader until has complete long value, or
|
||||
// read error.
|
||||
func longBinaryReader(ior io.Reader) (int64, error) { |
||||
var value uint64 |
||||
var shift uint |
||||
var err error |
||||
var b byte |
||||
|
||||
// NOTE: While benchmarks show it's more performant to invoke ReadByte when
|
||||
// available, testing whether a variable's data type implements a particular
|
||||
// method is quite slow too. So perform the test once, and branch to the
|
||||
// appropriate loop based on the results.
|
||||
|
||||
if byteReader, ok := ior.(io.ByteReader); ok { |
||||
for { |
||||
if b, err = byteReader.ReadByte(); err != nil { |
||||
return 0, err // NOTE: must send back unaltered error to detect io.EOF
|
||||
} |
||||
value |= uint64(b&intMask) << shift |
||||
if b&intFlag == 0 { |
||||
return (int64(value>>1) ^ -int64(value&1)), nil |
||||
} |
||||
shift += 7 |
||||
} |
||||
} |
||||
|
||||
// NOTE: ior does not also implement io.ByteReader, so we must allocate a
|
||||
// byte slice with a single byte, and read each byte into the slice.
|
||||
buf := make([]byte, 1) |
||||
for { |
||||
if _, err = ior.Read(buf); err != nil { |
||||
return 0, err // NOTE: must send back unaltered error to detect io.EOF
|
||||
} |
||||
b = buf[0] |
||||
value |= uint64(b&intMask) << shift |
||||
if b&intFlag == 0 { |
||||
return (int64(value>>1) ^ -int64(value&1)), nil |
||||
} |
||||
shift += 7 |
||||
} |
||||
} |
||||
|
||||
// metadataBinaryReader reads bytes from io.Reader until has entire map value,
|
||||
// or read error.
|
||||
func metadataBinaryReader(ior io.Reader) (map[string][]byte, error) { |
||||
var err error |
||||
var value interface{} |
||||
|
||||
// block count and block size
|
||||
if value, err = longBinaryReader(ior); err != nil { |
||||
return nil, fmt.Errorf("cannot read map block count: %s", err) |
||||
} |
||||
blockCount := value.(int64) |
||||
if blockCount < 0 { |
||||
if blockCount == math.MinInt64 { |
||||
// The minimum number for any signed numerical type can never be
|
||||
// made positive
|
||||
return nil, fmt.Errorf("cannot read map with block count: %d", math.MinInt64) |
||||
} |
||||
// NOTE: A negative block count implies there is a long encoded block
|
||||
// size following the negative block count. We have no use for the block
|
||||
// size in this decoder, so we read and discard the value.
|
||||
blockCount = -blockCount // convert to its positive equivalent
|
||||
if _, err = longBinaryReader(ior); err != nil { |
||||
return nil, fmt.Errorf("cannot read map block size: %s", err) |
||||
} |
||||
} |
||||
// Ensure block count does not exceed some sane value.
|
||||
if blockCount > MaxBlockCount { |
||||
return nil, fmt.Errorf("cannot read map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount) |
||||
} |
||||
// NOTE: While the attempt of a RAM optimization shown below is not
|
||||
// necessary, many encoders will encode all items in a single block. We can
|
||||
// optimize amount of RAM allocated by runtime for the array by initializing
|
||||
// the array for that number of items.
|
||||
mapValues := make(map[string][]byte, blockCount) |
||||
|
||||
for blockCount != 0 { |
||||
// Decode `blockCount` datum values from buffer
|
||||
for i := int64(0); i < blockCount; i++ { |
||||
// first decode the key string
|
||||
keyBytes, err := bytesBinaryReader(ior) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot read map key: %s", err) |
||||
} |
||||
key := string(keyBytes) |
||||
if _, ok := mapValues[key]; ok { |
||||
return nil, fmt.Errorf("cannot read map: duplicate key: %q", key) |
||||
} |
||||
// metadata values are always bytes
|
||||
buf, err := bytesBinaryReader(ior) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot read map value for key %q: %s", key, err) |
||||
} |
||||
mapValues[key] = buf |
||||
} |
||||
// Decode next blockCount from buffer, because there may be more blocks
|
||||
if value, err = longBinaryReader(ior); err != nil { |
||||
return nil, fmt.Errorf("cannot read map block count: %s", err) |
||||
} |
||||
blockCount = value.(int64) |
||||
if blockCount < 0 { |
||||
if blockCount == math.MinInt64 { |
||||
// The minimum number for any signed numerical type can never be
|
||||
// made positive
|
||||
return nil, fmt.Errorf("cannot read map with block count: %d", math.MinInt64) |
||||
} |
||||
// NOTE: A negative block count implies there is a long encoded
|
||||
// block size following the negative block count. We have no use for
|
||||
// the block size in this decoder, so we read and discard the value.
|
||||
blockCount = -blockCount // convert to its positive equivalent
|
||||
if _, err = longBinaryReader(ior); err != nil { |
||||
return nil, fmt.Errorf("cannot read map block size: %s", err) |
||||
} |
||||
} |
||||
// Ensure block count does not exceed some sane value.
|
||||
if blockCount > MaxBlockCount { |
||||
return nil, fmt.Errorf("cannot read map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount) |
||||
} |
||||
} |
||||
return mapValues, nil |
||||
} |
@ -0,0 +1,72 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"bytes" |
||||
"errors" |
||||
"fmt" |
||||
"io" |
||||
) |
||||
|
||||
// booleanNativeFromBinary decodes one binary boolean from the front of buf. A
// byte of 0 yields false and a byte of 1 yields true; the bytes after it are
// returned as the remainder. An empty buffer returns io.ErrShortBuffer and any
// other byte value is an error.
func booleanNativeFromBinary(buf []byte) (interface{}, []byte, error) {
	if len(buf) == 0 {
		return nil, nil, io.ErrShortBuffer
	}
	first, rest := buf[0], buf[1:]
	if first > 1 {
		return nil, nil, fmt.Errorf("cannot decode binary boolean: expected: Go byte(0) or byte(1); received: byte(%d)", first)
	}
	return first == 1, rest, nil
}
||||
|
||||
// booleanBinaryFromNative appends the binary form of the bool datum to buf: a
// single byte, 1 for true and 0 for false. A datum that is not a Go bool is an
// error.
func booleanBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
	flag, isBool := datum.(bool)
	switch {
	case !isBool:
		return nil, fmt.Errorf("cannot encode binary boolean: expected: Go bool; received: %T", datum)
	case flag:
		return append(buf, 1), nil
	default:
		return append(buf, 0), nil
	}
}
||||
|
||||
// booleanNativeFromTextual decodes the literal true or false from the front of
// buf and returns the value together with the bytes following the literal. A
// buffer shorter than five bytes that does not start with true is reported as
// a short buffer; anything else is rejected.
func booleanNativeFromTextual(buf []byte) (interface{}, []byte, error) {
	if bytes.HasPrefix(buf, []byte("true")) {
		return true, buf[4:], nil
	}
	if bytes.HasPrefix(buf, []byte("false")) {
		return false, buf[5:], nil
	}
	if len(buf) < 5 {
		// Too short to hold "false", and it was not "true" either.
		return nil, nil, fmt.Errorf("cannot decode textual boolean: %s", io.ErrShortBuffer)
	}
	return nil, nil, errors.New("expected false or true")
}
||||
|
||||
// booleanTextualFromNative appends the literal true or false to buf according
// to the bool datum. A datum that is not a Go bool is an error.
func booleanTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
	flag, isBool := datum.(bool)
	if !isBool {
		return nil, fmt.Errorf("boolean: expected: Go bool; received: %T", datum)
	}
	literal := "false"
	if flag {
		literal = "true"
	}
	return append(buf, literal...), nil
}
@ -0,0 +1,369 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"encoding/hex" |
||||
"errors" |
||||
"fmt" |
||||
"io" |
||||
"unicode" |
||||
"unicode/utf16" |
||||
"unicode/utf8" |
||||
) |
||||
|
||||
////////////////////////////////////////
|
||||
// Binary Decode
|
||||
////////////////////////////////////////
|
||||
|
||||
func bytesNativeFromBinary(buf []byte) (interface{}, []byte, error) { |
||||
if len(buf) < 1 { |
||||
return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", io.ErrShortBuffer) |
||||
} |
||||
var decoded interface{} |
||||
var err error |
||||
if decoded, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", err) |
||||
} |
||||
size := decoded.(int64) // always returns int64
|
||||
if size < 0 { |
||||
return nil, nil, fmt.Errorf("cannot decode binary bytes: negative size: %d", size) |
||||
} |
||||
if size > int64(len(buf)) { |
||||
return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", io.ErrShortBuffer) |
||||
} |
||||
return buf[:size], buf[size:], nil |
||||
} |
||||
|
||||
func stringNativeFromBinary(buf []byte) (interface{}, []byte, error) { |
||||
d, b, err := bytesNativeFromBinary(buf) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary string: %s", err) |
||||
} |
||||
return string(d.([]byte)), b, nil |
||||
} |
||||
|
||||
////////////////////////////////////////
|
||||
// Binary Encode
|
||||
////////////////////////////////////////
|
||||
|
||||
func bytesBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
var d []byte |
||||
switch datum.(type) { |
||||
case []byte: |
||||
d = datum.([]byte) |
||||
case string: |
||||
d = []byte(datum.(string)) |
||||
default: |
||||
return nil, fmt.Errorf("cannot encode binary bytes: expected: []byte; received: %T", datum) |
||||
} |
||||
buf, _ = longBinaryFromNative(buf, len(d)) // only fails when given non integer
|
||||
return append(buf, d...), nil // append datum bytes
|
||||
} |
||||
|
||||
func stringBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
someBytes, ok := datum.(string) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode binary bytes: expected: string; received: %T", datum) |
||||
} |
||||
buf, _ = longBinaryFromNative(buf, len(someBytes)) // only fails when given non integer
|
||||
return append(buf, someBytes...), nil // append datum bytes
|
||||
} |
||||
|
||||
////////////////////////////////////////
|
||||
// Text Decode
|
||||
////////////////////////////////////////
|
||||
|
||||
func bytesNativeFromTextual(buf []byte) (interface{}, []byte, error) { |
||||
buflen := len(buf) |
||||
if buflen < 2 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", io.ErrShortBuffer) |
||||
} |
||||
if buf[0] != '"' { |
||||
return nil, nil, fmt.Errorf("cannot decode textual bytes: expected initial \"; found: %#U", buf[0]) |
||||
} |
||||
var newBytes []byte |
||||
var escaped bool |
||||
// Loop through bytes following initial double quote, but note we will
|
||||
// return immediately when find unescaped double quote.
|
||||
for i := 1; i < buflen; i++ { |
||||
b := buf[i] |
||||
if escaped { |
||||
escaped = false |
||||
if b2, ok := unescapeSpecialJSON(b); ok { |
||||
newBytes = append(newBytes, b2) |
||||
continue |
||||
} |
||||
if b == 'u' { |
||||
// NOTE: Need at least 4 more bytes to read uint16, but subtract
|
||||
// 1 because do not want to count the trailing quote and
|
||||
// subtract another 1 because already consumed u but have yet to
|
||||
// increment i.
|
||||
if i > buflen-6 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", io.ErrShortBuffer) |
||||
} |
||||
// NOTE: Avro bytes represent binary data, and do not
|
||||
// necessarily represent text. Therefore, Avro bytes are not
|
||||
// encoded in UTF-16. Each \u is followed by 4 hexadecimal
|
||||
// digits, the first and second of which must be 0.
|
||||
v, err := parseUint64FromHexSlice(buf[i+3 : i+5]) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", err) |
||||
} |
||||
i += 4 // absorb 4 characters: one 'u' and three of the digits
|
||||
newBytes = append(newBytes, byte(v)) |
||||
continue |
||||
} |
||||
newBytes = append(newBytes, b) |
||||
continue |
||||
} |
||||
if b == '\\' { |
||||
escaped = true |
||||
continue |
||||
} |
||||
if b == '"' { |
||||
return newBytes, buf[i+1:], nil |
||||
} |
||||
newBytes = append(newBytes, b) |
||||
} |
||||
return nil, nil, fmt.Errorf("cannot decode textual bytes: expected final \"; found: %#U", buf[buflen-1]) |
||||
} |
||||
|
||||
func stringNativeFromTextual(buf []byte) (interface{}, []byte, error) { |
||||
buflen := len(buf) |
||||
if buflen < 2 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual string: %s", io.ErrShortBuffer) |
||||
} |
||||
if buf[0] != '"' { |
||||
return nil, nil, fmt.Errorf("cannot decode textual string: expected initial \"; found: %#U", buf[0]) |
||||
} |
||||
var newBytes []byte |
||||
var escaped bool |
||||
// Loop through bytes following initial double quote, but note we will
|
||||
// return immediately when find unescaped double quote.
|
||||
for i := 1; i < buflen; i++ { |
||||
b := buf[i] |
||||
if escaped { |
||||
escaped = false |
||||
if b2, ok := unescapeSpecialJSON(b); ok { |
||||
newBytes = append(newBytes, b2) |
||||
continue |
||||
} |
||||
if b == 'u' { |
||||
// NOTE: Need at least 4 more bytes to read uint16, but subtract
|
||||
// 1 because do not want to count the trailing quote and
|
||||
// subtract another 1 because already consumed u but have yet to
|
||||
// increment i.
|
||||
if i > buflen-6 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual string: %s", io.ErrShortBuffer) |
||||
} |
||||
v, err := parseUint64FromHexSlice(buf[i+1 : i+5]) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual string: %s", err) |
||||
} |
||||
i += 4 // absorb 4 characters: one 'u' and three of the digits
|
||||
|
||||
nbl := len(newBytes) |
||||
newBytes = append(newBytes, []byte{0, 0, 0, 0}...) // grow to make room for UTF-8 encoded rune
|
||||
|
||||
r := rune(v) |
||||
if utf16.IsSurrogate(r) { |
||||
i++ // absorb final hexadecimal digit from previous value
|
||||
|
||||
// Expect second half of surrogate pair
|
||||
if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' { |
||||
return nil, nil, errors.New("cannot decode textual string: missing second half of surrogate pair") |
||||
} |
||||
|
||||
v, err = parseUint64FromHexSlice(buf[i+2 : i+6]) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual string: %s", err) |
||||
} |
||||
i += 5 // absorb 5 characters: two for '\u', and 3 of the 4 digits
|
||||
|
||||
// Get code point by combining high and low surrogate bits
|
||||
r = utf16.DecodeRune(r, rune(v)) |
||||
} |
||||
|
||||
width := utf8.EncodeRune(newBytes[nbl:], r) // append UTF-8 encoded version of code point
|
||||
newBytes = newBytes[:nbl+width] // trim off excess bytes
|
||||
continue |
||||
} |
||||
newBytes = append(newBytes, b) |
||||
continue |
||||
} |
||||
if b == '\\' { |
||||
escaped = true |
||||
continue |
||||
} |
||||
if b == '"' { |
||||
return string(newBytes), buf[i+1:], nil |
||||
} |
||||
newBytes = append(newBytes, b) |
||||
} |
||||
return nil, nil, fmt.Errorf("cannot decode textual string: expected final \"; found: %x", buf[buflen-1]) |
||||
} |
||||
|
||||
// parseUint64FromHexSlice interprets buf as a run of hexadecimal digits, most
// significant first, and returns their value. Upper and lower case letters are
// both accepted. The first byte that is not a hexadecimal digit aborts the
// parse with a hex.InvalidByteError; an empty slice yields zero.
func parseUint64FromHexSlice(buf []byte) (uint64, error) {
	var total uint64
	for _, c := range buf {
		var nibble byte
		switch {
		case '0' <= c && c <= '9':
			nibble = c - '0'
		case 'A' <= c && c <= 'F':
			nibble = c - 'A' + 10
		case 'a' <= c && c <= 'f':
			nibble = c - 'a' + 10
		default:
			return 0, hex.InvalidByteError(c)
		}
		total = total<<4 | uint64(nibble)
	}
	return total, nil
}
||||
|
||||
// unescapeSpecialJSON maps the byte that follows a backslash in JSON text to
// the byte it stands for. The second result is true for the 8 special JSON
// escapes (quote, backslash, slash, b, f, n, r, t); for any other byte the
// input is returned unchanged together with false.
func unescapeSpecialJSON(b byte) (byte, bool) {
	// The quote, backslash and slash escapes stand for themselves.
	if b == '"' || b == '\\' || b == '/' {
		return b, true
	}
	// letters[i] is the escape letter for the control character controls[i].
	const letters, controls = "bfnrt", "\b\f\n\r\t"
	for i := 0; i < len(letters); i++ {
		if letters[i] == b {
			return controls[i], true
		}
	}
	return b, false
}
||||
|
||||
////////////////////////////////////////
|
||||
// Text Encode
|
||||
////////////////////////////////////////
|
||||
|
||||
func bytesTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
someBytes, ok := datum.([]byte) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode textual bytes: expected: []byte; received: %T", datum) |
||||
} |
||||
buf = append(buf, '"') // prefix buffer with double quote
|
||||
for _, b := range someBytes { |
||||
if escaped, ok := escapeSpecialJSON(b); ok { |
||||
buf = append(buf, escaped...) |
||||
continue |
||||
} |
||||
if r := rune(b); r < utf8.RuneSelf && unicode.IsPrint(r) { |
||||
buf = append(buf, b) |
||||
continue |
||||
} |
||||
// This Code Point _could_ be encoded as a single byte, however, it's
|
||||
// above standard ASCII range (b > 127), therefore must encode using its
|
||||
// four-byte hexadecimal equivalent, which will always start with the
|
||||
// high byte 00
|
||||
buf = appendUnicodeHex(buf, uint16(b)) |
||||
} |
||||
return append(buf, '"'), nil // postfix buffer with double quote
|
||||
} |
||||
|
||||
func stringTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
someString, ok := datum.(string) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode textual string: expected: string; received: %T", datum) |
||||
} |
||||
buf = append(buf, '"') // prefix buffer with double quote
|
||||
for _, r := range someString { |
||||
if escaped, ok := escapeSpecialJSON(byte(r)); ok { |
||||
buf = append(buf, escaped...) |
||||
continue |
||||
} |
||||
if r < utf8.RuneSelf && unicode.IsPrint(r) { |
||||
buf = append(buf, byte(r)) |
||||
continue |
||||
} |
||||
// NOTE: Attempt to encode code point as UTF-16 surrogate pair
|
||||
r1, r2 := utf16.EncodeRune(r) |
||||
if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar { |
||||
// code point does require surrogate pair, and thus two uint16 values
|
||||
buf = appendUnicodeHex(buf, uint16(r1)) |
||||
buf = appendUnicodeHex(buf, uint16(r2)) |
||||
continue |
||||
} |
||||
// Code Point does not require surrogate pair.
|
||||
buf = appendUnicodeHex(buf, uint16(r)) |
||||
} |
||||
return append(buf, '"'), nil // postfix buffer with double quote
|
||||
} |
||||
|
||||
func appendUnicodeHex(buf []byte, v uint16) []byte { |
||||
// Start with '\u' prefix:
|
||||
buf = append(buf, sliceUnicode...) |
||||
// And tack on 4 hexadecimal digits:
|
||||
buf = append(buf, hexDigits[(v&0xF000)>>12]) |
||||
buf = append(buf, hexDigits[(v&0xF00)>>8]) |
||||
buf = append(buf, hexDigits[(v&0xF0)>>4]) |
||||
buf = append(buf, hexDigits[(v&0xF)]) |
||||
return buf |
||||
} |
||||
|
||||
const hexDigits = "0123456789ABCDEF" |
||||
|
||||
// escapeSpecialJSON returns the two byte JSON escape for b when b is one of
// the 8 special JSON characters that must be escaped, together with true. For
// every other byte it returns nil and false. The returned slice is shared and
// must not be modified by the caller.
func escapeSpecialJSON(b byte) ([]byte, bool) {
	var replacement []byte
	switch b {
	case '\t':
		replacement = sliceTab
	case '\r':
		replacement = sliceCarriageReturn
	case '\n':
		replacement = sliceNewline
	case '\f':
		replacement = sliceFormfeed
	case '\b':
		replacement = sliceBackspace
	case '/':
		replacement = sliceSlash
	case '\\':
		replacement = sliceBackslash
	case '"':
		replacement = sliceQuote
	}
	return replacement, replacement != nil
}

// Slices in Go can never be constants, but they can be built once and shared.
// These escape sequences are created when the package is initialized and are
// reused every time JSON text is encoded.
var (
	sliceQuote          = []byte(`\"`)
	sliceBackslash      = []byte(`\\`)
	sliceSlash          = []byte(`\/`)
	sliceBackspace      = []byte(`\b`)
	sliceFormfeed       = []byte(`\f`)
	sliceNewline        = []byte(`\n`)
	sliceCarriageReturn = []byte(`\r`)
	sliceTab            = []byte(`\t`)
	sliceUnicode        = []byte(`\u`)
)
@ -0,0 +1,442 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"encoding/json" |
||||
"fmt" |
||||
"math" |
||||
) |
||||
|
||||
var (
	// MaxBlockCount caps how many data items a single block may announce when
	// it is decoded from a binary stream, be it a block of an array, of a map,
	// or of an OCF stream. The cap keeps hostile or corrupt input from making
	// the library allocate excessive RAM and thereby causing a denial of
	// service.
	//
	// Applications that legitimately need to decode blocks holding more items
	// may raise this value at their own discretion.
	MaxBlockCount int64 = math.MaxInt32

	// MaxBlockSize caps how many bytes are allocated for a single block of
	// data items when decoding from a binary stream. The cap keeps hostile or
	// corrupt input from making the library allocate excessive RAM and thereby
	// causing a denial of service.
	//
	// Applications that legitimately need to decode blocks holding more bytes
	// may raise this value at their own discretion.
	MaxBlockSize int64 = math.MaxInt32
)
||||
|
||||
// Codec supports decoding binary and text Avro data to Go native data types,
|
||||
// and conversely encoding Go native data types to binary or text Avro data. A
|
||||
// Codec is created as a stateless structure that can be safely used in multiple
|
||||
// go routines simultaneously.
|
||||
type Codec struct { |
||||
typeName *name |
||||
schema string |
||||
|
||||
nativeFromTextual func([]byte) (interface{}, []byte, error) |
||||
binaryFromNative func([]byte, interface{}) ([]byte, error) |
||||
nativeFromBinary func([]byte) (interface{}, []byte, error) |
||||
textualFromNative func([]byte, interface{}) ([]byte, error) |
||||
} |
||||
|
||||
func newSymbolTable() map[string]*Codec { |
||||
return map[string]*Codec{ |
||||
"boolean": { |
||||
typeName: &name{"boolean", nullNamespace}, |
||||
binaryFromNative: booleanBinaryFromNative, |
||||
nativeFromBinary: booleanNativeFromBinary, |
||||
nativeFromTextual: booleanNativeFromTextual, |
||||
textualFromNative: booleanTextualFromNative, |
||||
}, |
||||
"bytes": { |
||||
typeName: &name{"bytes", nullNamespace}, |
||||
binaryFromNative: bytesBinaryFromNative, |
||||
nativeFromBinary: bytesNativeFromBinary, |
||||
nativeFromTextual: bytesNativeFromTextual, |
||||
textualFromNative: bytesTextualFromNative, |
||||
}, |
||||
"double": { |
||||
typeName: &name{"double", nullNamespace}, |
||||
binaryFromNative: doubleBinaryFromNative, |
||||
nativeFromBinary: doubleNativeFromBinary, |
||||
nativeFromTextual: doubleNativeFromTextual, |
||||
textualFromNative: doubleTextualFromNative, |
||||
}, |
||||
"float": { |
||||
typeName: &name{"float", nullNamespace}, |
||||
binaryFromNative: floatBinaryFromNative, |
||||
nativeFromBinary: floatNativeFromBinary, |
||||
nativeFromTextual: floatNativeFromTextual, |
||||
textualFromNative: floatTextualFromNative, |
||||
}, |
||||
"int": { |
||||
|
||||
typeName: &name{"int", nullNamespace}, |
||||
binaryFromNative: intBinaryFromNative, |
||||
nativeFromBinary: intNativeFromBinary, |
||||
nativeFromTextual: intNativeFromTextual, |
||||
textualFromNative: intTextualFromNative, |
||||
}, |
||||
"long": { |
||||
typeName: &name{"long", nullNamespace}, |
||||
binaryFromNative: longBinaryFromNative, |
||||
nativeFromBinary: longNativeFromBinary, |
||||
nativeFromTextual: longNativeFromTextual, |
||||
textualFromNative: longTextualFromNative, |
||||
}, |
||||
"null": { |
||||
typeName: &name{"null", nullNamespace}, |
||||
binaryFromNative: nullBinaryFromNative, |
||||
nativeFromBinary: nullNativeFromBinary, |
||||
nativeFromTextual: nullNativeFromTextual, |
||||
textualFromNative: nullTextualFromNative, |
||||
}, |
||||
"string": { |
||||
typeName: &name{"string", nullNamespace}, |
||||
binaryFromNative: stringBinaryFromNative, |
||||
nativeFromBinary: stringNativeFromBinary, |
||||
nativeFromTextual: stringNativeFromTextual, |
||||
textualFromNative: stringTextualFromNative, |
||||
}, |
||||
} |
||||
} |
||||
|
||||
// NewCodec returns a Codec used to translate between a byte slice of either
|
||||
// binary or textual Avro data and native Go data.
|
||||
//
|
||||
// Creating a `Codec` is fast, but ought to be performed exactly once per Avro
|
||||
// schema to process. Once a `Codec` is created, it may be used multiple times
|
||||
// to convert data between native form and binary Avro representation, or
|
||||
// between native form and textual Avro representation.
|
||||
//
|
||||
// A particular `Codec` can work with only one Avro schema. However,
|
||||
// there is no practical limit to how many `Codec`s may be created and
|
||||
// used in a program. Internally a `Codec` is merely a named tuple of
|
||||
// four function pointers, and maintains no runtime state that is mutated
|
||||
// after instantiation. In other words, `Codec`s may be safely used by
|
||||
// many goroutines simultaneously, as your program requires.
|
||||
//
|
||||
// codec, err := goavro.NewCodec(`
|
||||
// {
|
||||
// "type": "record",
|
||||
// "name": "LongList",
|
||||
// "fields" : [
|
||||
// {"name": "next", "type": ["null", "LongList"], "default": null}
|
||||
// ]
|
||||
// }`)
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
func NewCodec(schemaSpecification string) (*Codec, error) { |
||||
// bootstrap a symbol table with primitive type codecs for the new codec
|
||||
st := newSymbolTable() |
||||
|
||||
// NOTE: Some clients might give us unadorned primitive type name for the
|
||||
// schema, e.g., "long". While it is not valid JSON, it is a valid schema.
|
||||
// Provide special handling for primitive type names.
|
||||
if c, ok := st[schemaSpecification]; ok { |
||||
c.schema = schemaSpecification |
||||
return c, nil |
||||
} |
||||
|
||||
// NOTE: At this point, schema should be valid JSON, otherwise it's an error
|
||||
// condition.
|
||||
var schema interface{} |
||||
if err := json.Unmarshal([]byte(schemaSpecification), &schema); err != nil { |
||||
return nil, fmt.Errorf("cannot unmarshal schema JSON: %s", err) |
||||
} |
||||
|
||||
c, err := buildCodec(st, nullNamespace, schema) |
||||
if err == nil { |
||||
// compact schema and save it
|
||||
compact, err := json.Marshal(schema) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot remarshal schema: %s", err) |
||||
} |
||||
c.schema = string(compact) |
||||
} |
||||
return c, err |
||||
} |
||||
|
||||
// BinaryFromNative appends the binary encoded byte slice representation of the
|
||||
// provided native datum value to the provided byte slice
|
||||
// in accordance with the Avro schema supplied when
|
||||
// creating the Codec. It is supplied a byte slice to which to append the binary
|
||||
// encoded data along with the actual data to encode. On success, it returns a
|
||||
// new byte slice with the encoded bytes appended, and a nil error value. On
|
||||
// error, it returns the original byte slice, and the error message.
|
||||
//
|
||||
// func ExampleBinaryFromNative() {
|
||||
// codec, err := goavro.NewCodec(`
|
||||
// {
|
||||
// "type": "record",
|
||||
// "name": "LongList",
|
||||
// "fields" : [
|
||||
// {"name": "next", "type": ["null", "LongList"], "default": null}
|
||||
// ]
|
||||
// }`)
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
//
|
||||
// // Convert native Go form to binary Avro data
|
||||
// binary, err := codec.BinaryFromNative(nil, map[string]interface{}{
|
||||
// "next": map[string]interface{}{
|
||||
// "LongList": map[string]interface{}{
|
||||
// "next": map[string]interface{}{
|
||||
// "LongList": map[string]interface{}{
|
||||
// // NOTE: May omit fields when using default value
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
// })
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
//
|
||||
// fmt.Printf("%#v", binary)
|
||||
// // Output: []byte{0x2, 0x2, 0x0}
|
||||
// }
|
||||
func (c *Codec) BinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
newBuf, err := c.binaryFromNative(buf, datum) |
||||
if err != nil { |
||||
return buf, err // if error, return original byte slice
|
||||
} |
||||
return newBuf, nil |
||||
} |
||||
|
||||
// NativeFromBinary returns a native datum value from the binary encoded byte
|
||||
// slice in accordance with the Avro schema supplied when creating the Codec. On
|
||||
// success, it returns the decoded datum, along with a new byte slice with the
|
||||
// decoded bytes consumed, and a nil error value. On error, it returns nil for
|
||||
// the datum value, the original byte slice, and the error message.
|
||||
//
|
||||
// func ExampleNativeFromBinary() {
|
||||
// codec, err := goavro.NewCodec(`
|
||||
// {
|
||||
// "type": "record",
|
||||
// "name": "LongList",
|
||||
// "fields" : [
|
||||
// {"name": "next", "type": ["null", "LongList"], "default": null}
|
||||
// ]
|
||||
// }`)
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
//
|
||||
// // Convert binary Avro data to native Go form
|
||||
// binary := []byte{0x2, 0x2, 0x0}
|
||||
//
|
||||
// native, _, err := codec.NativeFromBinary(binary)
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
//
|
||||
// fmt.Printf("%v", native)
|
||||
// // Output: map[next:map[LongList:map[next:map[LongList:map[next:<nil>]]]]]
|
||||
// }
|
||||
func (c *Codec) NativeFromBinary(buf []byte) (interface{}, []byte, error) { |
||||
value, newBuf, err := c.nativeFromBinary(buf) |
||||
if err != nil { |
||||
return nil, buf, err // if error, return original byte slice
|
||||
} |
||||
return value, newBuf, nil |
||||
} |
||||
|
||||
// NativeFromTextual converts Avro data in JSON text format from the provided byte
|
||||
// slice to Go native data types in accordance with the Avro schema supplied
|
||||
// when creating the Codec. On success, it returns the decoded datum, along with
|
||||
// a new byte slice with the decoded bytes consumed, and a nil error value. On
|
||||
// error, it returns nil for the datum value, the original byte slice, and the
|
||||
// error message.
|
||||
//
|
||||
// func ExampleNativeFromTextual() {
|
||||
// codec, err := goavro.NewCodec(`
|
||||
// {
|
||||
// "type": "record",
|
||||
// "name": "LongList",
|
||||
// "fields" : [
|
||||
// {"name": "next", "type": ["null", "LongList"], "default": null}
|
||||
// ]
|
||||
// }`)
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
//
|
||||
// // Convert text Avro data to native Go form
|
||||
// text := []byte(`{"next":{"LongList":{"next":{"LongList":{"next":null}}}}}`)
|
||||
//
|
||||
// native, _, err := codec.NativeFromTextual(text)
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
//
|
||||
// fmt.Printf("%v", native)
|
||||
// // Output: map[next:map[LongList:map[next:map[LongList:map[next:<nil>]]]]]
|
||||
// }
|
||||
func (c *Codec) NativeFromTextual(buf []byte) (interface{}, []byte, error) { |
||||
value, newBuf, err := c.nativeFromTextual(buf) |
||||
if err != nil { |
||||
return nil, buf, err // if error, return original byte slice
|
||||
} |
||||
return value, newBuf, nil |
||||
} |
||||
|
||||
// TextualFromNative converts Go native data types to Avro data in JSON text format in
|
||||
// accordance with the Avro schema supplied when creating the Codec. It is
|
||||
// supplied a byte slice to which to append the encoded data and the actual data
|
||||
// to encode. On success, it returns a new byte slice with the encoded bytes
|
||||
// appended, and a nil error value. On error, it returns the original byte
|
||||
// slice, and the error message.
|
||||
//
|
||||
// func ExampleTextualFromNative() {
|
||||
// codec, err := goavro.NewCodec(`
|
||||
// {
|
||||
// "type": "record",
|
||||
// "name": "LongList",
|
||||
// "fields" : [
|
||||
// {"name": "next", "type": ["null", "LongList"], "default": null}
|
||||
// ]
|
||||
// }`)
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
//
|
||||
// // Convert native Go form to text Avro data
|
||||
// text, err := codec.TextualFromNative(nil, map[string]interface{}{
|
||||
// "next": map[string]interface{}{
|
||||
// "LongList": map[string]interface{}{
|
||||
// "next": map[string]interface{}{
|
||||
// "LongList": map[string]interface{}{
|
||||
// // NOTE: May omit fields when using default value
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
// },
|
||||
// })
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
//
|
||||
// fmt.Printf("%s", text)
|
||||
// // Output: {"next":{"LongList":{"next":{"LongList":{"next":null}}}}}
|
||||
// }
|
||||
func (c *Codec) TextualFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
newBuf, err := c.textualFromNative(buf, datum) |
||||
if err != nil { |
||||
return buf, err // if error, return original byte slice
|
||||
} |
||||
return newBuf, nil |
||||
} |
||||
|
||||
// Schema returns the compact schema used to create the Codec.
|
||||
//
|
||||
// func ExampleCodecSchema() {
|
||||
// schema := `{"type":"map","values":{"type":"enum","name":"foo","symbols":["alpha","bravo"]}}`
|
||||
// codec, err := goavro.NewCodec(schema)
|
||||
// if err != nil {
|
||||
// fmt.Println(err)
|
||||
// }
|
||||
// fmt.Println(codec.Schema())
|
||||
// // Output: {"type":"map","values":{"name":"foo","symbols":["alpha","bravo"],"type":"enum"}}
|
||||
// }
|
||||
func (c *Codec) Schema() string { |
||||
return c.schema |
||||
} |
||||
|
||||
// convert a schema data structure to a codec, prefixing with specified
|
||||
// namespace
|
||||
func buildCodec(st map[string]*Codec, enclosingNamespace string, schema interface{}) (*Codec, error) { |
||||
switch schemaType := schema.(type) { |
||||
case map[string]interface{}: |
||||
return buildCodecForTypeDescribedByMap(st, enclosingNamespace, schemaType) |
||||
case string: |
||||
return buildCodecForTypeDescribedByString(st, enclosingNamespace, schemaType, nil) |
||||
case []interface{}: |
||||
return buildCodecForTypeDescribedBySlice(st, enclosingNamespace, schemaType) |
||||
default: |
||||
return nil, fmt.Errorf("unknown schema type: %T", schema) |
||||
} |
||||
} |
||||
|
||||
// Reach into the map, grabbing its "type". Use that to create the codec.
|
||||
func buildCodecForTypeDescribedByMap(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) { |
||||
t, ok := schemaMap["type"] |
||||
if !ok { |
||||
return nil, fmt.Errorf("missing type: %v", schemaMap) |
||||
} |
||||
switch v := t.(type) { |
||||
case string: |
||||
// Already defined types may be abbreviated with its string name.
|
||||
// EXAMPLE: "type":"array"
|
||||
// EXAMPLE: "type":"enum"
|
||||
// EXAMPLE: "type":"fixed"
|
||||
// EXAMPLE: "type":"int"
|
||||
// EXAMPLE: "type":"record"
|
||||
// EXAMPLE: "type":"somePreviouslyDefinedCustomTypeString"
|
||||
return buildCodecForTypeDescribedByString(st, enclosingNamespace, v, schemaMap) |
||||
case map[string]interface{}: |
||||
return buildCodecForTypeDescribedByMap(st, enclosingNamespace, v) |
||||
case []interface{}: |
||||
return buildCodecForTypeDescribedBySlice(st, enclosingNamespace, v) |
||||
default: |
||||
return nil, fmt.Errorf("type ought to be either string, map[string]interface{}, or []interface{}; received: %T", t) |
||||
} |
||||
} |
||||
|
||||
func buildCodecForTypeDescribedByString(st map[string]*Codec, enclosingNamespace string, typeName string, schemaMap map[string]interface{}) (*Codec, error) { |
||||
// NOTE: When codec already exists, return it. This includes both primitive
|
||||
// type codecs added in NewCodec, and user-defined types, added while
|
||||
// building the codec.
|
||||
if cd, ok := st[typeName]; ok { |
||||
return cd, nil |
||||
} |
||||
// NOTE: Sometimes schema may abbreviate type name inside a namespace.
|
||||
if enclosingNamespace != "" { |
||||
if cd, ok := st[enclosingNamespace+"."+typeName]; ok { |
||||
return cd, nil |
||||
} |
||||
} |
||||
// There are only a small handful of complex Avro data types.
|
||||
switch typeName { |
||||
case "array": |
||||
return makeArrayCodec(st, enclosingNamespace, schemaMap) |
||||
case "enum": |
||||
return makeEnumCodec(st, enclosingNamespace, schemaMap) |
||||
case "fixed": |
||||
return makeFixedCodec(st, enclosingNamespace, schemaMap) |
||||
case "map": |
||||
return makeMapCodec(st, enclosingNamespace, schemaMap) |
||||
case "record": |
||||
return makeRecordCodec(st, enclosingNamespace, schemaMap) |
||||
default: |
||||
return nil, fmt.Errorf("unknown type name: %q", typeName) |
||||
} |
||||
} |
||||
|
||||
// notion of enclosing namespace changes when record, enum, or fixed create a
|
||||
// new namespace, for child objects.
|
||||
func registerNewCodec(st map[string]*Codec, schemaMap map[string]interface{}, enclosingNamespace string) (*Codec, error) { |
||||
n, err := newNameFromSchemaMap(enclosingNamespace, schemaMap) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
c := &Codec{typeName: n} |
||||
st[n.fullName] = c |
||||
return c, nil |
||||
} |
@ -0,0 +1,68 @@ |
||||
/* |
||||
Package goavro is a library that encodes and decodes Avro data. |
||||
|
||||
Goavro provides methods to encode native Go data into both binary and textual |
||||
JSON Avro data, and methods to decode both binary and textual JSON Avro data to |
||||
native Go data. |
||||
|
||||
Goavro also provides methods to read and write Object Container File (OCF) |
||||
formatted files, and the library contains example programs to read and write OCF |
||||
files. |
||||
|
||||
Usage Example: |
||||
|
||||
package main |
||||
|
||||
import ( |
||||
"fmt" |
||||
|
||||
"github.com/linkedin/goavro" |
||||
) |
||||
|
||||
func main() { |
||||
codec, err := goavro.NewCodec(` |
||||
{ |
||||
"type": "record", |
||||
"name": "LongList", |
||||
"fields" : [ |
||||
{"name": "next", "type": ["null", "LongList"], "default": null} |
||||
] |
||||
}`) |
||||
if err != nil { |
||||
fmt.Println(err) |
||||
} |
||||
|
||||
// NOTE: May omit fields when using default value
|
||||
textual := []byte(`{"next":{"LongList":{}}}`) |
||||
|
||||
// Convert textual Avro data (in Avro JSON format) to native Go form
|
||||
native, _, err := codec.NativeFromTextual(textual) |
||||
if err != nil { |
||||
fmt.Println(err) |
||||
} |
||||
|
||||
// Convert native Go form to binary Avro data
|
||||
binary, err := codec.BinaryFromNative(nil, native) |
||||
if err != nil { |
||||
fmt.Println(err) |
||||
} |
||||
|
||||
// Convert binary Avro data back to native Go form
|
||||
native, _, err = codec.NativeFromBinary(binary) |
||||
if err != nil { |
||||
fmt.Println(err) |
||||
} |
||||
|
||||
// Convert native Go form to textual Avro data
|
||||
textual, err = codec.TextualFromNative(nil, native) |
||||
if err != nil { |
||||
fmt.Println(err) |
||||
} |
||||
|
||||
// NOTE: Textual encoding will show all fields, even those with values that
|
||||
// match their default values
|
||||
fmt.Println(string(textual)) |
||||
// Output: {"next":{"LongList":{"next":null}}}
|
||||
} |
||||
*/ |
||||
package goavro |
@ -0,0 +1,105 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"fmt" |
||||
"io" |
||||
) |
||||
|
||||
// enum does not have child objects, therefore whatever namespace it defines is
|
||||
// just to store its name in the symbol table.
|
||||
func makeEnumCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) { |
||||
c, err := registerNewCodec(st, schemaMap, enclosingNamespace) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Enum ought to have valid name: %s", err) |
||||
} |
||||
|
||||
// enum type must have symbols
|
||||
s1, ok := schemaMap["symbols"] |
||||
if !ok { |
||||
return nil, fmt.Errorf("Enum %q ought to have symbols key", c.typeName) |
||||
} |
||||
s2, ok := s1.([]interface{}) |
||||
if !ok || len(s2) == 0 { |
||||
return nil, fmt.Errorf("Enum %q symbols ought to be non-empty array of strings: %v", c.typeName, s1) |
||||
} |
||||
symbols := make([]string, len(s2)) |
||||
for i, s := range s2 { |
||||
symbol, ok := s.(string) |
||||
if !ok { |
||||
return nil, fmt.Errorf("Enum %q symbol %d ought to be non-empty string; received: %T", c.typeName, i+1, symbol) |
||||
} |
||||
if err := checkString(symbol); err != nil { |
||||
return nil, fmt.Errorf("Enum %q symbol %d ought to %s", c.typeName, i+1, err) |
||||
} |
||||
symbols[i] = symbol |
||||
} |
||||
|
||||
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) { |
||||
var value interface{} |
||||
var err error |
||||
var index int64 |
||||
|
||||
if value, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary enum %q index: %s", c.typeName, err) |
||||
} |
||||
index = value.(int64) |
||||
if index < 0 || index >= int64(len(symbols)) { |
||||
return nil, nil, fmt.Errorf("cannot decode binary enum %q: index ought to be between 0 and %d; read index: %d", c.typeName, len(symbols)-1, index) |
||||
} |
||||
return symbols[index], buf, nil |
||||
} |
||||
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) { |
||||
someString, ok := datum.(string) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode binary enum %q: expected string; received: %T", c.typeName, datum) |
||||
} |
||||
for i, symbol := range symbols { |
||||
if symbol == someString { |
||||
return longBinaryFromNative(buf, i) |
||||
} |
||||
} |
||||
return nil, fmt.Errorf("cannot encode binary enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString) |
||||
} |
||||
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) { |
||||
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual enum: %s", io.ErrShortBuffer) |
||||
} |
||||
// decode enum string
|
||||
var value interface{} |
||||
var err error |
||||
value, buf, err = stringNativeFromTextual(buf) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual enum: expected key: %s", err) |
||||
} |
||||
someString := value.(string) |
||||
for _, symbol := range symbols { |
||||
if symbol == someString { |
||||
return someString, buf, nil |
||||
} |
||||
} |
||||
return nil, nil, fmt.Errorf("cannot decode textual enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString) |
||||
} |
||||
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) { |
||||
someString, ok := datum.(string) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode textual enum %q: expected string; received: %T", c.typeName, datum) |
||||
} |
||||
for _, symbol := range symbols { |
||||
if symbol == someString { |
||||
return stringTextualFromNative(buf, someString) |
||||
} |
||||
} |
||||
return nil, fmt.Errorf("cannot encode textual enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString) |
||||
} |
||||
|
||||
return c, nil |
||||
} |
@ -0,0 +1,81 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"fmt" |
||||
) |
||||
|
||||
// Fixed does not have child objects, therefore whatever namespace it defines is
|
||||
// just to store its name in the symbol table.
|
||||
func makeFixedCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) { |
||||
c, err := registerNewCodec(st, schemaMap, enclosingNamespace) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Fixed ought to have valid name: %s", err) |
||||
} |
||||
// Fixed type must have size
|
||||
s1, ok := schemaMap["size"] |
||||
if !ok { |
||||
return nil, fmt.Errorf("Fixed %q ought to have size key", c.typeName) |
||||
} |
||||
s2, ok := s1.(float64) |
||||
if !ok || s2 <= 0 { |
||||
return nil, fmt.Errorf("Fixed %q size ought to be number greater than zero: %v", c.typeName, s1) |
||||
} |
||||
size := uint(s2) |
||||
|
||||
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) { |
||||
if buflen := uint(len(buf)); size > buflen { |
||||
return nil, nil, fmt.Errorf("cannot decode binary fixed %q: schema size exceeds remaining buffer size: %d > %d (short buffer)", c.typeName, size, buflen) |
||||
} |
||||
return buf[:size], buf[size:], nil |
||||
} |
||||
|
||||
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) { |
||||
someBytes, ok := datum.([]byte) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode binary fixed %q: expected []byte; received: %T", c.typeName, datum) |
||||
} |
||||
if count := uint(len(someBytes)); count != size { |
||||
return nil, fmt.Errorf("cannot encode binary fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size) |
||||
} |
||||
return append(buf, someBytes...), nil |
||||
} |
||||
|
||||
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) { |
||||
if buflen := uint(len(buf)); size > buflen { |
||||
return nil, nil, fmt.Errorf("cannot decode textual fixed %q: schema size exceeds remaining buffer size: %d > %d (short buffer)", c.typeName, size, buflen) |
||||
} |
||||
var datum interface{} |
||||
var err error |
||||
datum, buf, err = bytesNativeFromTextual(buf) |
||||
if err != nil { |
||||
return nil, buf, err |
||||
} |
||||
datumBytes := datum.([]byte) |
||||
if count := uint(len(datumBytes)); count != size { |
||||
return nil, nil, fmt.Errorf("cannot decode textual fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size) |
||||
} |
||||
return datum, buf, err |
||||
} |
||||
|
||||
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) { |
||||
someBytes, ok := datum.([]byte) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode textual fixed %q: expected []byte; received: %T", c.typeName, datum) |
||||
} |
||||
if count := uint(len(someBytes)); count != size { |
||||
return nil, fmt.Errorf("cannot encode textual fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size) |
||||
} |
||||
return bytesTextualFromNative(buf, someBytes) |
||||
} |
||||
|
||||
return c, nil |
||||
} |
@ -0,0 +1,293 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"bytes" |
||||
"encoding/binary" |
||||
"fmt" |
||||
"io" |
||||
"math" |
||||
"strconv" |
||||
) |
||||
|
||||
// doubleEncodedLength is the number of bytes a binary encoded double occupies.
const doubleEncodedLength = 8

// floatEncodedLength is the number of bytes a binary encoded float occupies.
const floatEncodedLength = 4
||||
|
||||
////////////////////////////////////////
|
||||
// Binary Decode
|
||||
////////////////////////////////////////
|
||||
|
||||
func doubleNativeFromBinary(buf []byte) (interface{}, []byte, error) { |
||||
if len(buf) < doubleEncodedLength { |
||||
return nil, nil, fmt.Errorf("cannot decode binary double: %s", io.ErrShortBuffer) |
||||
} |
||||
return math.Float64frombits(binary.LittleEndian.Uint64(buf[:doubleEncodedLength])), buf[doubleEncodedLength:], nil |
||||
} |
||||
|
||||
func floatNativeFromBinary(buf []byte) (interface{}, []byte, error) { |
||||
if len(buf) < floatEncodedLength { |
||||
return nil, nil, fmt.Errorf("cannot decode binary float: %s", io.ErrShortBuffer) |
||||
} |
||||
return math.Float32frombits(binary.LittleEndian.Uint32(buf[:floatEncodedLength])), buf[floatEncodedLength:], nil |
||||
} |
||||
|
||||
////////////////////////////////////////
|
||||
// Binary Encode
|
||||
////////////////////////////////////////
|
||||
|
||||
func doubleBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
var value float64 |
||||
switch v := datum.(type) { |
||||
case float64: |
||||
value = v |
||||
case float32: |
||||
value = float64(v) |
||||
case int: |
||||
if value = float64(v); int(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary double: provided Go int would lose precision: %d", v) |
||||
} |
||||
case int64: |
||||
if value = float64(v); int64(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary double: provided Go int64 would lose precision: %d", v) |
||||
} |
||||
case int32: |
||||
if value = float64(v); int32(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary double: provided Go int32 would lose precision: %d", v) |
||||
} |
||||
default: |
||||
return nil, fmt.Errorf("cannot encode binary double: expected: Go numeric; received: %T", datum) |
||||
} |
||||
buf = append(buf, 0, 0, 0, 0, 0, 0, 0, 0) |
||||
binary.LittleEndian.PutUint64(buf[len(buf)-doubleEncodedLength:], math.Float64bits(value)) |
||||
return buf, nil |
||||
} |
||||
|
||||
func floatBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
var value float32 |
||||
switch v := datum.(type) { |
||||
case float32: |
||||
value = v |
||||
case float64: |
||||
// Assume runtime can cast special floats correctly, and if there is a
|
||||
// loss of precision from float64 and float32, that should be expected
|
||||
// or at least understood by the client.
|
||||
value = float32(v) |
||||
case int: |
||||
if value = float32(v); int(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary float: provided Go int would lose precision: %d", v) |
||||
} |
||||
case int64: |
||||
if value = float32(v); int64(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary float: provided Go int64 would lose precision: %d", v) |
||||
} |
||||
case int32: |
||||
if value = float32(v); int32(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary float: provided Go int32 would lose precision: %d", v) |
||||
} |
||||
default: |
||||
return nil, fmt.Errorf("cannot encode binary float: expected: Go numeric; received: %T", datum) |
||||
} |
||||
// return floatingBinaryEncoder(buf, uint64(math.Float32bits(value)), floatEncodedLength)
|
||||
buf = append(buf, 0, 0, 0, 0) |
||||
binary.LittleEndian.PutUint32(buf[len(buf)-floatEncodedLength:], uint32(math.Float32bits(value))) |
||||
return buf, nil |
||||
} |
||||
|
||||
////////////////////////////////////////
|
||||
// Text Decode
|
||||
////////////////////////////////////////
|
||||
|
||||
func doubleNativeFromTextual(buf []byte) (interface{}, []byte, error) { |
||||
return floatingTextDecoder(buf, 64) |
||||
} |
||||
|
||||
func floatNativeFromTextual(buf []byte) (interface{}, []byte, error) { |
||||
return floatingTextDecoder(buf, 32) |
||||
} |
||||
|
||||
func floatingTextDecoder(buf []byte, bitSize int) (interface{}, []byte, error) { |
||||
buflen := len(buf) |
||||
if buflen >= 4 { |
||||
if bytes.Equal(buf[:4], []byte("null")) { |
||||
return math.NaN(), buf[4:], nil |
||||
} |
||||
if buflen >= 5 { |
||||
if bytes.Equal(buf[:5], []byte("1e999")) { |
||||
return math.Inf(1), buf[5:], nil |
||||
} |
||||
if buflen >= 6 { |
||||
if bytes.Equal(buf[:6], []byte("-1e999")) { |
||||
return math.Inf(-1), buf[6:], nil |
||||
} |
||||
} |
||||
} |
||||
} |
||||
index, err := numberLength(buf, true) // NOTE: floatAllowed = true
|
||||
if err != nil { |
||||
return nil, nil, err |
||||
} |
||||
datum, err := strconv.ParseFloat(string(buf[:index]), bitSize) |
||||
if err != nil { |
||||
return nil, nil, err |
||||
} |
||||
return datum, buf[index:], nil |
||||
} |
||||
|
||||
// numberLength reports how many leading bytes of buf form a number, by
// walking a small state machine: an optional '-', then either a single '0'
// or a non-zero digit followed by more digits. When floatAllowed is true an
// optional fraction ('.' plus at least one digit) and an optional exponent
// ('e' or 'E', optional sign, at least one digit) are also consumed.
//
// It returns io.ErrShortBuffer when buf ends where more input is required,
// and an "unexpected byte" error when a required digit is missing.
func numberLength(buf []byte, floatAllowed bool) (int, error) {
	n := len(buf)
	if n == 0 {
		return 0, io.ErrShortBuffer
	}
	pos := 0

	// digits moves pos past a run of ASCII digits and reports the run length.
	digits := func() int {
		start := pos
		for pos < n && buf[pos] >= '0' && buf[pos] <= '9' {
			pos++
		}
		return pos - start
	}

	// Optional leading minus sign; something must follow it.
	if buf[pos] == '-' {
		pos++
		if pos == n {
			return 0, io.ErrShortBuffer
		}
	}

	// Integer part: a lone zero, or a non-zero digit plus any more digits.
	switch c := buf[pos]; {
	case c == '0':
		pos++
	case c >= '1' && c <= '9':
		pos++
		digits()
	default:
		return 0, fmt.Errorf("unexpected byte: %q", c)
	}
	if pos == n || !floatAllowed {
		return pos, nil
	}

	// Optional fraction: '.' followed by one or more digits.
	if buf[pos] == '.' {
		pos++
		if pos == n {
			return 0, io.ErrShortBuffer
		}
		if digits() == 0 {
			return 0, fmt.Errorf("unexpected byte: %q", buf[pos])
		}
		if pos == n {
			return pos, nil
		}
	}

	// Optional exponent: e|E, optional sign, one or more digits.
	if c := buf[pos]; c == 'e' || c == 'E' {
		pos++
		if pos == n {
			return 0, io.ErrShortBuffer
		}
		if c = buf[pos]; c == '+' || c == '-' {
			pos++
			if pos == n {
				return 0, io.ErrShortBuffer
			}
		}
		if digits() == 0 {
			return 0, fmt.Errorf("unexpected byte: %q", buf[pos])
		}
	}
	return pos, nil
}
||||
|
||||
////////////////////////////////////////
|
||||
// Text Encode
|
||||
////////////////////////////////////////
|
||||
|
||||
func floatTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
return floatingTextEncoder(buf, datum, 32) |
||||
} |
||||
|
||||
func doubleTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
return floatingTextEncoder(buf, datum, 64) |
||||
} |
||||
|
||||
// floatingTextEncoder appends the textual form of datum to buf.
//
// Floating point inputs (float32, float64) are written with
// strconv.AppendFloat using the shortest 'g' representation for bitSize,
// except that NaN is written as "null" and the infinities as "1e999" and
// "-1e999". Integer inputs (int, int64, int32) are written in base 10. Any
// other type produces an error whose wording depends on whether bitSize is
// 64 ("double") or not ("float").
func floatingTextEncoder(buf []byte, datum interface{}, bitSize int) ([]byte, error) {
	isDouble := bitSize == 64

	// appendFloat writes a floating point value, mapping the special values
	// onto their textual stand-ins.
	appendFloat := func(f float64) ([]byte, error) {
		switch {
		case math.IsNaN(f):
			return append(buf, "null"...), nil
		case math.IsInf(f, 1):
			return append(buf, "1e999"...), nil
		case math.IsInf(f, -1):
			return append(buf, "-1e999"...), nil
		}
		return strconv.AppendFloat(buf, f, 'g', -1, bitSize), nil
	}

	var whole int64
	switch n := datum.(type) {
	case float32:
		return appendFloat(float64(n))
	case float64:
		return appendFloat(n)
	case int:
		whole = int64(n)
		if int(whole) != n {
			if isDouble {
				return nil, fmt.Errorf("cannot encode textual double: provided Go int would lose precision: %d", n)
			}
			return nil, fmt.Errorf("cannot encode textual float: provided Go int would lose precision: %d", n)
		}
	case int64:
		whole = n
	case int32:
		whole = int64(n)
		if int32(whole) != n {
			if isDouble {
				return nil, fmt.Errorf("cannot encode textual double: provided Go int32 would lose precision: %d", n)
			}
			return nil, fmt.Errorf("cannot encode textual float: provided Go int32 would lose precision: %d", n)
		}
	default:
		if isDouble {
			return nil, fmt.Errorf("cannot encode textual double: expected: Go numeric; received: %T", datum)
		}
		return nil, fmt.Errorf("cannot encode textual float: expected: Go numeric; received: %T", datum)
	}
	return strconv.AppendInt(buf, whole, 10), nil
}
@ -0,0 +1,199 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"fmt" |
||||
"io" |
||||
"strconv" |
||||
) |
||||
|
||||
// Constants shared by the variable-length integer encoders and decoders.
const (
	// intDownShift is the arithmetic right shift applied to an int32 when
	// zig-zag encoding it.
	intDownShift = uint32(31)

	// intFlag is the high bit of an encoded byte; when set, more bytes follow.
	intFlag = byte(128)

	// intMask selects the seven payload bits of an encoded byte.
	intMask = byte(127)

	// longDownShift is the arithmetic right shift applied to an int64 when
	// zig-zag encoding it.
	longDownShift = uint32(63)
)
||||
|
||||
////////////////////////////////////////
|
||||
// Binary Decode
|
||||
////////////////////////////////////////
|
||||
|
||||
func intNativeFromBinary(buf []byte) (interface{}, []byte, error) { |
||||
var offset, value int |
||||
var shift uint |
||||
for offset = 0; offset < len(buf); offset++ { |
||||
b := buf[offset] |
||||
value |= int(b&intMask) << shift |
||||
if b&intFlag == 0 { |
||||
return (int32(value>>1) ^ -int32(value&1)), buf[offset+1:], nil |
||||
} |
||||
shift += 7 |
||||
} |
||||
return nil, nil, io.ErrShortBuffer |
||||
} |
||||
|
||||
func longNativeFromBinary(buf []byte) (interface{}, []byte, error) { |
||||
var offset int |
||||
var value uint64 |
||||
var shift uint |
||||
for offset = 0; offset < len(buf); offset++ { |
||||
b := buf[offset] |
||||
value |= uint64(b&intMask) << shift |
||||
if b&intFlag == 0 { |
||||
return (int64(value>>1) ^ -int64(value&1)), buf[offset+1:], nil |
||||
} |
||||
shift += 7 |
||||
} |
||||
return nil, nil, io.ErrShortBuffer |
||||
} |
||||
|
||||
////////////////////////////////////////
|
||||
// Binary Encode
|
||||
////////////////////////////////////////
|
||||
|
||||
func intBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
var value int32 |
||||
switch v := datum.(type) { |
||||
case int32: |
||||
value = v |
||||
case int: |
||||
if value = int32(v); int(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary int: provided Go int would lose precision: %d", v) |
||||
} |
||||
case int64: |
||||
if value = int32(v); int64(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary int: provided Go int64 would lose precision: %d", v) |
||||
} |
||||
case float64: |
||||
if value = int32(v); float64(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary int: provided Go float64 would lose precision: %f", v) |
||||
} |
||||
case float32: |
||||
if value = int32(v); float32(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary int: provided Go float32 would lose precision: %f", v) |
||||
} |
||||
default: |
||||
return nil, fmt.Errorf("cannot encode binary int: expected: Go numeric; received: %T", datum) |
||||
} |
||||
encoded := uint64((uint32(value) << 1) ^ uint32(value>>intDownShift)) |
||||
return integerBinaryEncoder(buf, encoded) |
||||
} |
||||
|
||||
func longBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
var value int64 |
||||
switch v := datum.(type) { |
||||
case int64: |
||||
value = v |
||||
case int: |
||||
value = int64(v) |
||||
case int32: |
||||
value = int64(v) |
||||
case float64: |
||||
if value = int64(v); float64(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary long: provided Go float64 would lose precision: %f", v) |
||||
} |
||||
case float32: |
||||
if value = int64(v); float32(value) != v { |
||||
return nil, fmt.Errorf("cannot encode binary long: provided Go float32 would lose precision: %f", v) |
||||
} |
||||
default: |
||||
return nil, fmt.Errorf("long: expected: Go numeric; received: %T", datum) |
||||
} |
||||
encoded := (uint64(value) << 1) ^ uint64(value>>longDownShift) |
||||
return integerBinaryEncoder(buf, encoded) |
||||
} |
||||
|
||||
func integerBinaryEncoder(buf []byte, encoded uint64) ([]byte, error) { |
||||
// used by both intBinaryEncoder and longBinaryEncoder
|
||||
if encoded == 0 { |
||||
return append(buf, 0), nil |
||||
} |
||||
for encoded > 0 { |
||||
b := byte(encoded) & intMask |
||||
encoded = encoded >> 7 |
||||
if encoded != 0 { |
||||
b |= intFlag // set high bit; we have more bytes
|
||||
} |
||||
buf = append(buf, b) |
||||
} |
||||
return buf, nil |
||||
} |
||||
|
||||
////////////////////////////////////////
|
||||
// Text Decode
|
||||
////////////////////////////////////////
|
||||
|
||||
func longNativeFromTextual(buf []byte) (interface{}, []byte, error) { |
||||
return integerTextDecoder(buf, 64) |
||||
} |
||||
|
||||
func intNativeFromTextual(buf []byte) (interface{}, []byte, error) { |
||||
return integerTextDecoder(buf, 32) |
||||
} |
||||
|
||||
func integerTextDecoder(buf []byte, bitSize int) (interface{}, []byte, error) { |
||||
index, err := numberLength(buf, false) // NOTE: floatAllowed = false
|
||||
if err != nil { |
||||
return nil, nil, err |
||||
} |
||||
datum, err := strconv.ParseInt(string(buf[:index]), 10, bitSize) |
||||
if err != nil { |
||||
return nil, nil, err |
||||
} |
||||
if bitSize == 32 { |
||||
return int32(datum), buf[index:], nil |
||||
} |
||||
return datum, buf[index:], nil |
||||
} |
||||
|
||||
////////////////////////////////////////
|
||||
// Text Encode
|
||||
////////////////////////////////////////
|
||||
|
||||
func longTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
return integerTextEncoder(buf, datum, 64) |
||||
} |
||||
|
||||
func intTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
return integerTextEncoder(buf, datum, 32) |
||||
} |
||||
|
||||
// integerTextEncoder appends the base-10 textual form of datum to buf.
//
// Accepted Go types are int, int32, int64, float32 and float64. Floating
// point values are rejected when they are outside the int64 range or are not
// whole numbers. When bitSize is 32 the value must additionally fit in an
// int32; otherwise the emitted text could not be read back by the matching
// 32-bit text decoder, which parses with a 32-bit limit. Error wording
// depends on whether bitSize is 64 ("long") or not ("int").
func integerTextEncoder(buf []byte, datum interface{}, bitSize int) ([]byte, error) {
	// fitsInt64 reports whether f lies inside [-2^63, 2^63). Converting an
	// out-of-range float to an integer is implementation-dependent in Go, so
	// the round-trip comparison alone cannot be relied on to detect overflow.
	fitsInt64 := func(f float64) bool {
		return f >= -9223372036854775808.0 && f < 9223372036854775808.0
	}

	var someInt64 int64
	switch v := datum.(type) {
	case int:
		someInt64 = int64(v)
	case int32:
		someInt64 = int64(v)
	case int64:
		someInt64 = v
	case float32:
		if someInt64 = int64(v); !fitsInt64(float64(v)) || float32(someInt64) != v {
			if bitSize == 64 {
				return nil, fmt.Errorf("cannot encode textual long: provided Go float32 would lose precision: %f", v)
			}
			return nil, fmt.Errorf("cannot encode textual int: provided Go float32 would lose precision: %f", v)
		}
	case float64:
		if someInt64 = int64(v); !fitsInt64(v) || float64(someInt64) != v {
			if bitSize == 64 {
				return nil, fmt.Errorf("cannot encode textual long: provided Go float64 would lose precision: %f", v)
			}
			return nil, fmt.Errorf("cannot encode textual int: provided Go float64 would lose precision: %f", v)
		}
	default:
		if bitSize == 64 {
			return nil, fmt.Errorf("cannot encode textual long: expected: Go numeric; received: %T", datum)
		}
		return nil, fmt.Errorf("cannot encode textual int: expected: Go numeric; received: %T", datum)
	}

	// A 32-bit int must not silently carry a value only a long can hold.
	if bitSize == 32 && int64(int32(someInt64)) != someInt64 {
		return nil, fmt.Errorf("cannot encode textual int: provided Go %T would lose precision: %v", datum, datum)
	}
	return strconv.AppendInt(buf, someInt64, 10), nil
}
@ -0,0 +1,307 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"errors" |
||||
"fmt" |
||||
"io" |
||||
"math" |
||||
"reflect" |
||||
) |
||||
|
||||
func makeMapCodec(st map[string]*Codec, namespace string, schemaMap map[string]interface{}) (*Codec, error) { |
||||
// map type must have values
|
||||
valueSchema, ok := schemaMap["values"] |
||||
if !ok { |
||||
return nil, errors.New("Map ought to have values key") |
||||
} |
||||
valueCodec, err := buildCodec(st, namespace, valueSchema) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Map values ought to be valid Avro type: %s", err) |
||||
} |
||||
|
||||
return &Codec{ |
||||
typeName: &name{"map", nullNamespace}, |
||||
nativeFromBinary: func(buf []byte) (interface{}, []byte, error) { |
||||
var err error |
||||
var value interface{} |
||||
|
||||
// block count and block size
|
||||
if value, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map block count: %s", err) |
||||
} |
||||
blockCount := value.(int64) |
||||
if blockCount < 0 { |
||||
// NOTE: A negative block count implies there is a long encoded
|
||||
// block size following the negative block count. We have no use
|
||||
// for the block size in this decoder, so we read and discard
|
||||
// the value.
|
||||
if blockCount == math.MinInt64 { |
||||
// The minimum number for any signed numerical type can
|
||||
// never be made positive
|
||||
return nil, nil, fmt.Errorf("cannot decode binary map with block count: %d", math.MinInt64) |
||||
} |
||||
blockCount = -blockCount // convert to its positive equivalent
|
||||
if _, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map block size: %s", err) |
||||
} |
||||
} |
||||
// Ensure block count does not exceed some sane value.
|
||||
if blockCount > MaxBlockCount { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount) |
||||
} |
||||
// NOTE: While the attempt of a RAM optimization shown below is not
|
||||
// necessary, many encoders will encode all items in a single block.
|
||||
// We can optimize amount of RAM allocated by runtime for the array
|
||||
// by initializing the array for that number of items.
|
||||
mapValues := make(map[string]interface{}, blockCount) |
||||
|
||||
for blockCount != 0 { |
||||
// Decode `blockCount` datum values from buffer
|
||||
for i := int64(0); i < blockCount; i++ { |
||||
// first decode the key string
|
||||
if value, buf, err = stringNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map key: %s", err) |
||||
} |
||||
key := value.(string) // string decoder always returns a string
|
||||
if _, ok := mapValues[key]; ok { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map: duplicate key: %q", key) |
||||
} |
||||
// then decode the value
|
||||
if value, buf, err = valueCodec.nativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map value for key %q: %s", key, err) |
||||
} |
||||
mapValues[key] = value |
||||
} |
||||
// Decode next blockCount from buffer, because there may be more blocks
|
||||
if value, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map block count: %s", err) |
||||
} |
||||
blockCount = value.(int64) |
||||
if blockCount < 0 { |
||||
// NOTE: A negative block count implies there is a long
|
||||
// encoded block size following the negative block count. We
|
||||
// have no use for the block size in this decoder, so we
|
||||
// read and discard the value.
|
||||
if blockCount == math.MinInt64 { |
||||
// The minimum number for any signed numerical type can
|
||||
// never be made positive
|
||||
return nil, nil, fmt.Errorf("cannot decode binary map with block count: %d", math.MinInt64) |
||||
} |
||||
blockCount = -blockCount // convert to its positive equivalent
|
||||
if _, buf, err = longNativeFromBinary(buf); err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map block size: %s", err) |
||||
} |
||||
} |
||||
// Ensure block count does not exceed some sane value.
|
||||
if blockCount > MaxBlockCount { |
||||
return nil, nil, fmt.Errorf("cannot decode binary map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount) |
||||
} |
||||
} |
||||
return mapValues, buf, nil |
||||
}, |
||||
binaryFromNative: func(buf []byte, datum interface{}) ([]byte, error) { |
||||
mapValues, err := convertMap(datum) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot encode binary map: %s", err) |
||||
} |
||||
|
||||
keyCount := int64(len(mapValues)) |
||||
var alreadyEncoded, remainingInBlock int64 |
||||
|
||||
for k, v := range mapValues { |
||||
if remainingInBlock == 0 { // start a new block
|
||||
remainingInBlock = keyCount - alreadyEncoded |
||||
if remainingInBlock > MaxBlockCount { |
||||
// limit block count to MacBlockCount
|
||||
remainingInBlock = MaxBlockCount |
||||
} |
||||
buf, _ = longBinaryFromNative(buf, remainingInBlock) |
||||
} |
||||
|
||||
// only fails when given non string, so elide error checking
|
||||
buf, _ = stringBinaryFromNative(buf, k) |
||||
|
||||
// encode the value
|
||||
if buf, err = valueCodec.binaryFromNative(buf, v); err != nil { |
||||
return nil, fmt.Errorf("cannot encode binary map value for key %q: %v: %s", k, v, err) |
||||
} |
||||
|
||||
remainingInBlock-- |
||||
alreadyEncoded++ |
||||
} |
||||
return longBinaryFromNative(buf, 0) // append tailing 0 block count to signal end of Map
|
||||
}, |
||||
nativeFromTextual: func(buf []byte) (interface{}, []byte, error) { |
||||
return genericMapTextDecoder(buf, valueCodec, nil) // codecFromKey == nil
|
||||
}, |
||||
textualFromNative: func(buf []byte, datum interface{}) ([]byte, error) { |
||||
return genericMapTextEncoder(buf, datum, valueCodec, nil) |
||||
}, |
||||
}, nil |
||||
} |
||||
|
||||
// genericMapTextDecoder decodes a JSON text blob to a native Go map, using the
|
||||
// codecs from codecFromKey, and if a key is not found in that map, from
|
||||
// defaultCodec if provided. If defaultCodec is nil, this function returns an
|
||||
// error if it encounters a map key that is not present in codecFromKey. If
|
||||
// codecFromKey is nil, every map value will be decoded using defaultCodec, if
|
||||
// possible.
|
||||
func genericMapTextDecoder(buf []byte, defaultCodec *Codec, codecFromKey map[string]*Codec) (map[string]interface{}, []byte, error) { |
||||
var value interface{} |
||||
var err error |
||||
var b byte |
||||
|
||||
lencodec := len(codecFromKey) |
||||
mapValues := make(map[string]interface{}, lencodec) |
||||
|
||||
if buf, err = advanceAndConsume(buf, '{'); err != nil { |
||||
return nil, nil, err |
||||
} |
||||
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 { |
||||
return nil, nil, io.ErrShortBuffer |
||||
} |
||||
// NOTE: Special case empty map
|
||||
if buf[0] == '}' { |
||||
return mapValues, buf[1:], nil |
||||
} |
||||
|
||||
// NOTE: Also terminates when read '}' byte.
|
||||
for len(buf) > 0 { |
||||
// decode key string
|
||||
value, buf, err = stringNativeFromTextual(buf) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual map: expected key: %s", err) |
||||
} |
||||
key := value.(string) |
||||
// Is key already used?
|
||||
if _, ok := mapValues[key]; ok { |
||||
return nil, nil, fmt.Errorf("cannot decode textual map: duplicate key: %q", key) |
||||
} |
||||
// Find a codec for the key
|
||||
fieldCodec := codecFromKey[key] |
||||
if fieldCodec == nil { |
||||
fieldCodec = defaultCodec |
||||
} |
||||
if fieldCodec == nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual map: cannot determine codec: %q", key) |
||||
} |
||||
// decode colon
|
||||
if buf, err = advanceAndConsume(buf, ':'); err != nil { |
||||
return nil, nil, err |
||||
} |
||||
// decode value
|
||||
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 { |
||||
return nil, nil, io.ErrShortBuffer |
||||
} |
||||
value, buf, err = fieldCodec.nativeFromTextual(buf) |
||||
if err != nil { |
||||
return nil, nil, err |
||||
} |
||||
// set map value for key
|
||||
mapValues[key] = value |
||||
// either comma or closing curly brace
|
||||
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 { |
||||
return nil, nil, io.ErrShortBuffer |
||||
} |
||||
switch b = buf[0]; b { |
||||
case '}': |
||||
return mapValues, buf[1:], nil |
||||
case ',': |
||||
// no-op
|
||||
default: |
||||
return nil, nil, fmt.Errorf("cannot decode textual map: expected ',' or '}'; received: %q", b) |
||||
} |
||||
// NOTE: consume comma from above
|
||||
if buf, _ = advanceToNonWhitespace(buf[1:]); len(buf) == 0 { |
||||
return nil, nil, io.ErrShortBuffer |
||||
} |
||||
} |
||||
return nil, nil, io.ErrShortBuffer |
||||
} |
||||
|
||||
// genericMapTextEncoder encodes a native Go map to a JSON text blob, using the
|
||||
// codecs from codecFromKey, and if a key is not found in that map, from
|
||||
// defaultCodec if provided. If defaultCodec is nil, this function returns an
|
||||
// error if it encounters a map key that is not present in codecFromKey. If
|
||||
// codecFromKey is nil, every map value will be encoded using defaultCodec, if
|
||||
// possible.
|
||||
func genericMapTextEncoder(buf []byte, datum interface{}, defaultCodec *Codec, codecFromKey map[string]*Codec) ([]byte, error) { |
||||
mapValues, err := convertMap(datum) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot encode textual map: %s", err) |
||||
} |
||||
|
||||
var atLeastOne bool |
||||
|
||||
buf = append(buf, '{') |
||||
|
||||
for key, value := range mapValues { |
||||
atLeastOne = true |
||||
|
||||
// Find a codec for the key
|
||||
fieldCodec := codecFromKey[key] |
||||
if fieldCodec == nil { |
||||
fieldCodec = defaultCodec |
||||
} |
||||
if fieldCodec == nil { |
||||
return nil, fmt.Errorf("cannot encode textual map: cannot determine codec: %q", key) |
||||
} |
||||
// Encode key string
|
||||
buf, err = stringTextualFromNative(buf, key) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
buf = append(buf, ':') |
||||
// Encode value
|
||||
buf, err = fieldCodec.textualFromNative(buf, value) |
||||
if err != nil { |
||||
// field was specified in datum; therefore its value was invalid
|
||||
return nil, fmt.Errorf("cannot encode textual map: value for %q does not match its schema: %s", key, err) |
||||
} |
||||
buf = append(buf, ',') |
||||
} |
||||
|
||||
if atLeastOne { |
||||
return append(buf[:len(buf)-1], '}'), nil |
||||
} |
||||
return append(buf, '}'), nil |
||||
} |
||||
|
||||
// convertMap returns datum as a map[string]interface{}.
//
// A value that already has that type is returned as is. Any other map is
// copied entry by entry through reflection, provided every key is a string;
// anything else results in an error.
func convertMap(datum interface{}) (map[string]interface{}, error) {
	if direct, ok := datum.(map[string]interface{}); ok {
		return direct, nil
	}

	// As a convenience to the caller, maps with other value types are
	// copied into the generic form.
	rv := reflect.ValueOf(datum)
	if rv.Kind() != reflect.Map {
		return nil, fmt.Errorf("cannot create map[string]interface{}: expected map[string]...; received: %T", datum)
	}

	// NOTE: this copy is O(n) through reflection; converting the map header
	// in place, or a bulk copy, would be cheaper if either were available.
	converted := make(map[string]interface{}, rv.Len())
	keys := rv.MapKeys()
	for i := range keys {
		text, isString := keys[i].Interface().(string)
		if !isString {
			// Give up as soon as a key turns out not to be a string.
			return nil, fmt.Errorf("cannot create map[string]interface{}: expected map[string]...; received: %T", datum)
		}
		converted[text] = rv.MapIndex(keys[i]).Interface()
	}
	return converted, nil
}
@ -0,0 +1,143 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"errors" |
||||
"fmt" |
||||
"strings" |
||||
) |
||||
|
||||
// nullNamespace is the empty string, standing for "no namespace".
const nullNamespace = ""
||||
|
||||
// ErrInvalidName reports that some part of an Avro name is not valid.
type ErrInvalidName struct {
	// Message completes the sentence "schema name ought to ...".
	Message string
}

// Error implements the error interface by prefixing Message with
// "schema name ought to ".
func (e ErrInvalidName) Error() string {
	const prefix = "schema name ought to "
	return prefix + e.Message
}
||||
|
||||
// isRuneInvalidForFirstCharacter reports whether r may NOT start a name
// component, i.e. whether it is anything other than an ASCII letter or an
// underscore.
//
// NOTE: It is meant to be applied to individual name components, after the
// full name has been split on the period rune.
func isRuneInvalidForFirstCharacter(r rune) bool {
	switch {
	case r >= 'A' && r <= 'Z', r >= 'a' && r <= 'z', r == '_':
		return false
	}
	return true
}
||||
|
||||
func isRuneInvalidForOtherCharacters(r rune) bool { |
||||
return isRuneInvalidForFirstCharacter(r) && (r < '0' || r > '9') |
||||
} |
||||
|
||||
func checkNameComponent(s string) error { |
||||
err := checkString(s) |
||||
if err != nil { |
||||
return &ErrInvalidName{err.Error()} |
||||
} |
||||
return err |
||||
} |
||||
|
||||
func checkString(s string) error { |
||||
if len(s) == 0 { |
||||
return errors.New("be non-empty string") |
||||
} |
||||
if strings.IndexFunc(s[:1], isRuneInvalidForFirstCharacter) != -1 { |
||||
return errors.New("start with [A-Za-z_]: " + s) |
||||
} |
||||
if strings.IndexFunc(s[1:], isRuneInvalidForOtherCharacters) != -1 { |
||||
return errors.New("have second and remaining characters contain only [A-Za-z0-9_]: " + s) |
||||
} |
||||
return nil |
||||
} |
||||
|
||||
// name holds an Avro name as its full (dotted) form together with the
// namespace it belongs to.
type name struct {
	// fullName is the complete Avro name of the instance.
	fullName string

	// namespace is kept so that new names can be derived from this one.
	namespace string
}
||||
|
||||
// newName returns a new Name instance after first ensuring the arguments do not
|
||||
// violate any of the Avro naming rules.
|
||||
func newName(n, ns, ens string) (*name, error) { |
||||
var nn name |
||||
|
||||
if index := strings.LastIndexByte(n, '.'); index > -1 { |
||||
// inputName does contain a dot, so ignore everything else and use it as the full name
|
||||
nn.fullName = n |
||||
nn.namespace = n[:index] |
||||
} else { |
||||
// inputName does not contain a dot, therefore is not the full name
|
||||
if ns != nullNamespace { |
||||
// if namespace provided in the schema in the same schema level, use it
|
||||
nn.fullName = ns + "." + n |
||||
nn.namespace = ns |
||||
} else if ens != nullNamespace { |
||||
// otherwise if enclosing namespace provided, use it
|
||||
nn.fullName = ens + "." + n |
||||
nn.namespace = ens |
||||
} else { |
||||
// otherwise no namespace, so use null namespace, the empty string
|
||||
nn.fullName = n |
||||
} |
||||
} |
||||
|
||||
// verify all components of the full name for adherence to Avro naming rules
|
||||
for i, component := range strings.Split(nn.fullName, ".") { |
||||
if i == 0 && RelaxedNameValidation && component == "" { |
||||
continue |
||||
} |
||||
if err := checkNameComponent(component); err != nil { |
||||
return nil, err |
||||
} |
||||
} |
||||
|
||||
return &nn, nil |
||||
} |
||||
|
||||
// RelaxedNameValidation, when true, lets name validation accept an empty
// string as the first component of an Avro namespace.
var RelaxedNameValidation bool
||||
|
||||
func newNameFromSchemaMap(enclosingNamespace string, schemaMap map[string]interface{}) (*name, error) { |
||||
var nameString, namespaceString string |
||||
|
||||
name, ok := schemaMap["name"] |
||||
if !ok { |
||||
return nil, errors.New("schema ought to have name key") |
||||
} |
||||
nameString, ok = name.(string) |
||||
if !ok || nameString == nullNamespace { |
||||
return nil, fmt.Errorf("schema name ought to be non-empty string; received: %T", name) |
||||
} |
||||
namespace, ok := schemaMap["namespace"] |
||||
if ok { |
||||
namespaceString, ok = namespace.(string) |
||||
if !ok || namespaceString == nullNamespace { |
||||
return nil, fmt.Errorf("schema namespace, if provided, ought to be non-empty string; received: %T", namespace) |
||||
} |
||||
} |
||||
|
||||
return newName(nameString, namespaceString, enclosingNamespace) |
||||
} |
||||
|
||||
func (n *name) String() string { |
||||
return n.fullName |
||||
} |
||||
|
||||
// short returns the name without the prefixed namespace.
|
||||
func (n *name) short() string { |
||||
if index := strings.LastIndexByte(n.fullName, '.'); index > -1 { |
||||
return n.fullName[index+1:] |
||||
} |
||||
return n.fullName |
||||
} |
@ -0,0 +1,45 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"bytes" |
||||
"errors" |
||||
"fmt" |
||||
"io" |
||||
) |
||||
|
||||
// nullBytes is the textual form of the null value.
var nullBytes = []byte{'n', 'u', 'l', 'l'}
||||
|
||||
// nullNativeFromBinary decodes a binary null: it consumes nothing from buf
// and returns a nil datum with buf unchanged.
func nullNativeFromBinary(buf []byte) (interface{}, []byte, error) {
	return nil, buf, nil
}
||||
|
||||
// nullBinaryFromNative encodes a binary null, which occupies zero bytes: buf
// is returned unchanged when datum is nil, and an error is returned for any
// other datum.
func nullBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
	if datum == nil {
		return buf, nil
	}
	return nil, fmt.Errorf("cannot encode binary null: expected: Go nil; received: %T", datum)
}
||||
|
||||
func nullNativeFromTextual(buf []byte) (interface{}, []byte, error) { |
||||
if len(buf) < 4 { |
||||
return nil, nil, fmt.Errorf("cannot decode textual null: %s", io.ErrShortBuffer) |
||||
} |
||||
if bytes.Equal(buf[:4], nullBytes) { |
||||
return nil, buf[4:], nil |
||||
} |
||||
return nil, nil, errors.New("cannot decode textual null: expected: null") |
||||
} |
||||
|
||||
func nullTextualFromNative(buf []byte, datum interface{}) ([]byte, error) { |
||||
if datum != nil { |
||||
return nil, fmt.Errorf("cannot encode textual null: expected: Go nil; received: %T", datum) |
||||
} |
||||
return append(buf, nullBytes...), nil |
||||
} |
@ -0,0 +1,240 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"bytes" |
||||
"crypto/rand" |
||||
"errors" |
||||
"fmt" |
||||
"io" |
||||
) |
||||
|
||||
// Labels naming the OCF block compression algorithms. These are the strings
// accepted in OCFConfig.CompressionName and stored under the "avro.codec"
// metadata key of an OCF header.
const (
	// CompressionNullLabel is used when OCF blocks are not compressed.
	CompressionNullLabel = "null"

	// CompressionDeflateLabel is used when OCF blocks are compressed using the
	// deflate algorithm.
	CompressionDeflateLabel = "deflate"

	// CompressionSnappyLabel is used when OCF blocks are compressed using the
	// snappy algorithm.
	CompressionSnappyLabel = "snappy"
)
||||
|
||||
// compressionID values specify the compression algorithm used to compress and
// decompress Avro Object Container File (OCF) streams.
type compressionID uint8

// Internal identifiers matching the exported compression labels. The zero
// value is compressionNull, i.e. no compression.
const (
	compressionNull compressionID = iota
	compressionDeflate
	compressionSnappy
)
||||
|
||||
const (
	// ocfBlockConst is the extra capacity reserved around a block's payload
	// when a file data block is assembled.
	ocfBlockConst = 24 // Each OCF block has two longs prefix, and sync marker suffix

	// ocfHeaderSizeConst is the extra capacity reserved beyond the schema
	// length when the header buffer is allocated.
	ocfHeaderSizeConst = 48 // OCF header is usually about 48 bytes longer than its compressed schema

	// ocfMagicString is the four-byte sequence that begins every OCF header.
	ocfMagicString = "Obj\x01"

	// ocfMetadataSchema is the Avro schema used to encode the header metadata.
	ocfMetadataSchema = `{"type":"map","values":"bytes"}`

	// ocfSyncLength is the size in bytes of the sync marker.
	ocfSyncLength = 16
)
||||
|
||||
var (
	// ocfMagicBytes is ocfMagicString as a byte slice, used when comparing
	// and writing the header's magic bytes.
	ocfMagicBytes = []byte(ocfMagicString)

	// ocfMetadataCodec is assigned in init from ocfMetadataSchema and is used
	// to encode header metadata.
	ocfMetadataCodec *Codec
)
||||
|
||||
// init builds ocfMetadataCodec from ocfMetadataSchema.
func init() {
	// The error from NewCodec is deliberately discarded.
	// NOTE(review): presumably safe because the schema is a fixed constant;
	// confirm that NewCodec cannot fail for it.
	ocfMetadataCodec, _ = NewCodec(ocfMetadataSchema)
}
||||
|
||||
// ocfHeader holds the information read from, or to be written to, the header
// of an Object Container File.
type ocfHeader struct {
	codec         *Codec                // codec built from (or supplying) the avro.schema entry
	compressionID compressionID         // block compression algorithm (avro.codec entry)
	syncMarker    [ocfSyncLength]byte   // marker that terminates the header and every block
	metadata      map[string][]byte     // file metadata entries
}
||||
|
||||
func newOCFHeader(config OCFConfig) (*ocfHeader, error) { |
||||
var err error |
||||
|
||||
header := new(ocfHeader) |
||||
|
||||
//
|
||||
// avro.codec
|
||||
//
|
||||
switch config.CompressionName { |
||||
case "": |
||||
header.compressionID = compressionNull |
||||
case CompressionNullLabel: |
||||
header.compressionID = compressionNull |
||||
case CompressionDeflateLabel: |
||||
header.compressionID = compressionDeflate |
||||
case CompressionSnappyLabel: |
||||
header.compressionID = compressionSnappy |
||||
default: |
||||
return nil, fmt.Errorf("cannot create OCF header using unrecognized compression algorithm: %q", config.CompressionName) |
||||
} |
||||
|
||||
//
|
||||
// avro.schema
|
||||
//
|
||||
if config.Codec != nil { |
||||
header.codec = config.Codec |
||||
} else if config.Schema == "" { |
||||
return nil, fmt.Errorf("cannot create OCF header without either Codec or Schema specified") |
||||
} else { |
||||
if header.codec, err = NewCodec(config.Schema); err != nil { |
||||
return nil, fmt.Errorf("cannot create OCF header: %s", err) |
||||
} |
||||
} |
||||
|
||||
header.metadata = config.MetaData |
||||
|
||||
//
|
||||
// The 16-byte, randomly-generated sync marker for this file.
|
||||
//
|
||||
_, err = rand.Read(header.syncMarker[:]) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
|
||||
return header, nil |
||||
} |
||||
|
||||
func readOCFHeader(ior io.Reader) (*ocfHeader, error) { |
||||
//
|
||||
// magic bytes
|
||||
//
|
||||
magic := make([]byte, 4) |
||||
_, err := io.ReadFull(ior, magic) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot read OCF header magic bytes: %s", err) |
||||
} |
||||
if !bytes.Equal(magic, ocfMagicBytes) { |
||||
return nil, fmt.Errorf("cannot read OCF header with invalid magic bytes: %#q", magic) |
||||
} |
||||
|
||||
//
|
||||
// metadata
|
||||
//
|
||||
metadata, err := metadataBinaryReader(ior) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot read OCF header metadata: %s", err) |
||||
} |
||||
|
||||
//
|
||||
// avro.codec
|
||||
//
|
||||
// NOTE: Avro specification states that `null` cID is used by
|
||||
// default when "avro.codec" was not included in the metadata header. The
|
||||
// specification does not talk about the case when "avro.codec" was included
|
||||
// with the empty string as its value. I believe it is an error for an OCF
|
||||
// file to provide the empty string as the cID algorithm. While it
|
||||
// is trivially easy to gracefully handle here, I'm not sure whether this
|
||||
// happens a lot, and don't want to accept bad input unless we have
|
||||
// significant reason to do so.
|
||||
var cID compressionID |
||||
value, ok := metadata["avro.codec"] |
||||
if ok { |
||||
switch avroCodec := string(value); avroCodec { |
||||
case CompressionNullLabel: |
||||
cID = compressionNull |
||||
case CompressionDeflateLabel: |
||||
cID = compressionDeflate |
||||
case CompressionSnappyLabel: |
||||
cID = compressionSnappy |
||||
default: |
||||
return nil, fmt.Errorf("cannot read OCF header using unrecognized compression algorithm from avro.codec: %q", avroCodec) |
||||
} |
||||
} |
||||
|
||||
//
|
||||
// create goavro.Codec from specified avro.schema
|
||||
//
|
||||
value, ok = metadata["avro.schema"] |
||||
if !ok { |
||||
return nil, errors.New("cannot read OCF header without avro.schema") |
||||
} |
||||
codec, err := NewCodec(string(value)) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot read OCF header with invalid avro.schema: %s", err) |
||||
} |
||||
|
||||
header := &ocfHeader{codec: codec, compressionID: cID, metadata: metadata} |
||||
|
||||
//
|
||||
// read and store sync marker
|
||||
//
|
||||
if n, err := io.ReadFull(ior, header.syncMarker[:]); err != nil { |
||||
return nil, fmt.Errorf("cannot read OCF header without sync marker: only read %d of %d bytes: %s", n, ocfSyncLength, err) |
||||
} |
||||
|
||||
//
|
||||
// header is valid
|
||||
//
|
||||
return header, nil |
||||
} |
||||
|
||||
func writeOCFHeader(header *ocfHeader, iow io.Writer) (err error) { |
||||
//
|
||||
// avro.codec
|
||||
//
|
||||
var avroCodec string |
||||
switch header.compressionID { |
||||
case compressionNull: |
||||
avroCodec = CompressionNullLabel |
||||
case compressionDeflate: |
||||
avroCodec = CompressionDeflateLabel |
||||
case compressionSnappy: |
||||
avroCodec = CompressionSnappyLabel |
||||
default: |
||||
return fmt.Errorf("should not get here: cannot write OCF header using unrecognized compression algorithm: %d", header.compressionID) |
||||
} |
||||
|
||||
//
|
||||
// avro.schema
|
||||
//
|
||||
// Create buffer for OCF header. The first four bytes are magic, and we'll
|
||||
// use copy to fill them in, so initialize buffer's length with 4, and its
|
||||
// capacity equal to length of avro schema plus a constant.
|
||||
schema := header.codec.Schema() |
||||
buf := make([]byte, 4, len(schema)+ocfHeaderSizeConst) |
||||
_ = copy(buf, ocfMagicBytes) |
||||
|
||||
//
|
||||
// file metadata, including the schema
|
||||
//
|
||||
meta := make(map[string]interface{}) |
||||
for k, v := range header.metadata { |
||||
meta[k] = v |
||||
} |
||||
meta["avro.schema"] = []byte(schema) |
||||
meta["avro.codec"] = []byte(avroCodec) |
||||
|
||||
buf, err = ocfMetadataCodec.BinaryFromNative(buf, meta) |
||||
if err != nil { |
||||
return fmt.Errorf("should not get here: cannot write OCF header: %s", err) |
||||
} |
||||
|
||||
//
|
||||
// 16-byte sync marker
|
||||
//
|
||||
buf = append(buf, header.syncMarker[:]...) |
||||
|
||||
// emit OCF header
|
||||
_, err = iow.Write(buf) |
||||
if err != nil { |
||||
return fmt.Errorf("cannot write OCF header: %s", err) |
||||
} |
||||
return nil |
||||
} |
@ -0,0 +1,263 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"bytes" |
||||
"compress/flate" |
||||
"encoding/binary" |
||||
"errors" |
||||
"fmt" |
||||
"hash/crc32" |
||||
"io" |
||||
"io/ioutil" |
||||
|
||||
"github.com/golang/snappy" |
||||
) |
||||
|
||||
// OCFReader structure is used to read Object Container Files (OCF).
type OCFReader struct {
	header              *ocfHeader // header read from the stream when the reader was created
	block               []byte     // buffer from which decoding takes place
	rerr                error      // most recent error that took place while reading bytes (unrecoverable)
	ior                 io.Reader  // underlying stream from which blocks are read
	readReady           bool       // true after Scan and before Read
	remainingBlockItems int64      // count of encoded data items remaining in block buffer to be decoded
}
||||
|
||||
// NewOCFReader initializes and returns a new structure used to read an Avro
|
||||
// Object Container File (OCF).
|
||||
//
|
||||
// func example(ior io.Reader) error {
|
||||
// // NOTE: Wrap provided io.Reader in a buffered reader, which improves the
|
||||
// // performance of streaming file data.
|
||||
// br := bufio.NewReader(ior)
|
||||
// ocfr, err := goavro.NewOCFReader(br)
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// for ocfr.Scan() {
|
||||
// datum, err := ocfr.Read()
|
||||
// if err != nil {
|
||||
// return err
|
||||
// }
|
||||
// fmt.Println(datum)
|
||||
// }
|
||||
// return ocfr.Err()
|
||||
// }
|
||||
func NewOCFReader(ior io.Reader) (*OCFReader, error) { |
||||
header, err := readOCFHeader(ior) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot create OCFReader: %s", err) |
||||
} |
||||
return &OCFReader{header: header, ior: ior}, nil |
||||
} |
||||
|
||||
// MetaData returns the file metadata map found within the OCF file. The
// returned map is the one held by the reader's header, not a copy.
func (ocfr *OCFReader) MetaData() map[string][]byte {
	return ocfr.header.metadata
}
||||
|
||||
// Codec returns the codec found within the OCF file, i.e. the codec stored in
// the reader's header.
func (ocfr *OCFReader) Codec() *Codec {
	return ocfr.header.codec
}
||||
|
||||
// CompressionName returns the name of the compression algorithm found within
|
||||
// the OCF file.
|
||||
func (ocfr *OCFReader) CompressionName() string { |
||||
switch ocfr.header.compressionID { |
||||
case compressionNull: |
||||
return CompressionNullLabel |
||||
case compressionDeflate: |
||||
return CompressionDeflateLabel |
||||
case compressionSnappy: |
||||
return CompressionSnappyLabel |
||||
default: |
||||
return "should not get here: unrecognized compression algorithm" |
||||
} |
||||
} |
||||
|
||||
// Err returns the last error encountered while reading the OCF file, or nil
// when no error is recorded. See `NewOCFReader` documentation for an example.
func (ocfr *OCFReader) Err() error {
	return ocfr.rerr
}
||||
|
||||
// Read consumes one datum value from the Avro OCF stream and returns it. Read
|
||||
// is designed to be called only once after each invocation of the Scan method.
|
||||
// See `NewOCFReader` documentation for an example.
|
||||
func (ocfr *OCFReader) Read() (interface{}, error) { |
||||
// NOTE: Test previous error before testing readReady to prevent overwriting
|
||||
// previous error.
|
||||
if ocfr.rerr != nil { |
||||
return nil, ocfr.rerr |
||||
} |
||||
if !ocfr.readReady { |
||||
ocfr.rerr = errors.New("Read called without successful Scan") |
||||
return nil, ocfr.rerr |
||||
} |
||||
ocfr.readReady = false |
||||
|
||||
// decode one datum value from block
|
||||
var datum interface{} |
||||
datum, ocfr.block, ocfr.rerr = ocfr.header.codec.NativeFromBinary(ocfr.block) |
||||
if ocfr.rerr != nil { |
||||
return false, ocfr.rerr |
||||
} |
||||
ocfr.remainingBlockItems-- |
||||
|
||||
return datum, nil |
||||
} |
||||
|
||||
// RemainingBlockItems returns the number of items remaining in the block being
// processed, i.e. the reader's count of data items not yet decoded from the
// current block.
func (ocfr *OCFReader) RemainingBlockItems() int64 {
	return ocfr.remainingBlockItems
}
||||
|
||||
// Scan returns true when there is at least one more data item to be read from
|
||||
// the Avro OCF. Scan ought to be called prior to calling the Read method each
|
||||
// time the Read method is invoked. See `NewOCFReader` documentation for an
|
||||
// example.
|
||||
func (ocfr *OCFReader) Scan() bool { |
||||
ocfr.readReady = false |
||||
|
||||
if ocfr.rerr != nil { |
||||
return false |
||||
} |
||||
|
||||
// NOTE: If there are no more remaining data items from the existing block,
|
||||
// then attempt to slurp in the next block.
|
||||
if ocfr.remainingBlockItems <= 0 { |
||||
if count := len(ocfr.block); count != 0 { |
||||
ocfr.rerr = fmt.Errorf("extra bytes between final datum in previous block and block sync marker: %d", count) |
||||
return false |
||||
} |
||||
|
||||
// Read the block count and update the number of remaining items for
|
||||
// this block
|
||||
ocfr.remainingBlockItems, ocfr.rerr = longBinaryReader(ocfr.ior) |
||||
if ocfr.rerr != nil { |
||||
if ocfr.rerr == io.EOF { |
||||
ocfr.rerr = nil // merely end of file, rather than error
|
||||
} else { |
||||
ocfr.rerr = fmt.Errorf("cannot read block count: %s", ocfr.rerr) |
||||
} |
||||
return false |
||||
} |
||||
if ocfr.remainingBlockItems <= 0 { |
||||
ocfr.rerr = fmt.Errorf("cannot decode when block count is not greater than 0: %d", ocfr.remainingBlockItems) |
||||
return false |
||||
} |
||||
if ocfr.remainingBlockItems > MaxBlockCount { |
||||
ocfr.rerr = fmt.Errorf("cannot decode when block count exceeds MaxBlockCount: %d > %d", ocfr.remainingBlockItems, MaxBlockCount) |
||||
} |
||||
|
||||
var blockSize int64 |
||||
blockSize, ocfr.rerr = longBinaryReader(ocfr.ior) |
||||
if ocfr.rerr != nil { |
||||
ocfr.rerr = fmt.Errorf("cannot read block size: %s", ocfr.rerr) |
||||
return false |
||||
} |
||||
if blockSize <= 0 { |
||||
ocfr.rerr = fmt.Errorf("cannot decode when block size is not greater than 0: %d", blockSize) |
||||
return false |
||||
} |
||||
if blockSize > MaxBlockSize { |
||||
ocfr.rerr = fmt.Errorf("cannot decode when block size exceeds MaxBlockSize: %d > %d", blockSize, MaxBlockSize) |
||||
return false |
||||
} |
||||
|
||||
// read entire block into buffer
|
||||
ocfr.block = make([]byte, blockSize) |
||||
_, ocfr.rerr = io.ReadFull(ocfr.ior, ocfr.block) |
||||
if ocfr.rerr != nil { |
||||
ocfr.rerr = fmt.Errorf("cannot read block: %s", ocfr.rerr) |
||||
return false |
||||
} |
||||
|
||||
switch ocfr.header.compressionID { |
||||
case compressionNull: |
||||
// no-op
|
||||
|
||||
case compressionDeflate: |
||||
// NOTE: flate.NewReader wraps with io.ByteReader if argument does
|
||||
// not implement that interface.
|
||||
rc := flate.NewReader(bytes.NewBuffer(ocfr.block)) |
||||
ocfr.block, ocfr.rerr = ioutil.ReadAll(rc) |
||||
if ocfr.rerr != nil { |
||||
_ = rc.Close() |
||||
return false |
||||
} |
||||
if ocfr.rerr = rc.Close(); ocfr.rerr != nil { |
||||
return false |
||||
} |
||||
|
||||
case compressionSnappy: |
||||
index := len(ocfr.block) - 4 // last 4 bytes is crc32 of decoded block
|
||||
if index <= 0 { |
||||
ocfr.rerr = fmt.Errorf("cannot decompress snappy without CRC32 checksum: %d", len(ocfr.block)) |
||||
return false |
||||
} |
||||
decoded, err := snappy.Decode(nil, ocfr.block[:index]) |
||||
if err != nil { |
||||
ocfr.rerr = fmt.Errorf("cannot decompress: %s", err) |
||||
return false |
||||
} |
||||
actualCRC := crc32.ChecksumIEEE(decoded) |
||||
expectedCRC := binary.BigEndian.Uint32(ocfr.block[index : index+4]) |
||||
if actualCRC != expectedCRC { |
||||
ocfr.rerr = fmt.Errorf("snappy CRC32 checksum mismatch: %x != %x", actualCRC, expectedCRC) |
||||
return false |
||||
} |
||||
ocfr.block = decoded |
||||
|
||||
default: |
||||
ocfr.rerr = fmt.Errorf("should not get here: cannot compress block using unrecognized compression: %d", ocfr.header.compressionID) |
||||
return false |
||||
|
||||
} |
||||
|
||||
// read and ensure sync marker matches
|
||||
sync := make([]byte, ocfSyncLength) |
||||
var n int |
||||
if n, ocfr.rerr = io.ReadFull(ocfr.ior, sync); ocfr.rerr != nil { |
||||
ocfr.rerr = fmt.Errorf("cannot read sync marker: read %d out of %d bytes: %s", n, ocfSyncLength, ocfr.rerr) |
||||
return false |
||||
} |
||||
if !bytes.Equal(sync, ocfr.header.syncMarker[:]) { |
||||
ocfr.rerr = fmt.Errorf("sync marker mismatch: %v != %v", sync, ocfr.header.syncMarker) |
||||
return false |
||||
} |
||||
} |
||||
|
||||
ocfr.readReady = true |
||||
return true |
||||
} |
||||
|
||||
// SkipThisBlockAndReset can be called after an error occurs while reading or
// decoding datum values from an OCF stream. OCF specifies each OCF stream
// contain one or more blocks of data. Each block consists of a block count, the
// number of bytes for the block, followed by the possibly compressed
// block. Inside each decompressed block is all of the binary encoded datum
// values concatenated together. In other words, OCF framing is at a block level
// rather than a datum level. If there is an error while reading or decoding a
// datum, the reader is not able to skip to the next datum value, because OCF
// does not have any markers for where each datum ends and the next one
// begins. Therefore, the reader is only able to skip this datum value and all
// subsequent datum values in the current block, move to the next block and
// start decoding datum values there.
//
// Concretely, it zeroes the remaining item count, empties the block buffer,
// and clears the recorded error.
func (ocfr *OCFReader) SkipThisBlockAndReset() {
	// ??? is it an error to call method unless the reader has had an error
	ocfr.remainingBlockItems = 0
	ocfr.block = ocfr.block[:0]
	ocfr.rerr = nil
}
@ -0,0 +1,253 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"bytes" |
||||
"compress/flate" |
||||
"encoding/binary" |
||||
"errors" |
||||
"fmt" |
||||
"hash/crc32" |
||||
"io" |
||||
"io/ioutil" |
||||
"os" |
||||
|
||||
"github.com/golang/snappy" |
||||
) |
||||
|
||||
// OCFConfig is used to specify creation parameters for OCFWriter.
type OCFConfig struct {
	// W specifies the `io.Writer` to which to send the encoded data,
	// (required). If W is `*os.File`, then creating an OCF for writing will
	// attempt to read any existing OCF header and use the schema and
	// compression codec specified by the existing header, then advance the file
	// position to the tail end of the file for appending.
	W io.Writer

	// Codec specifies the Codec to use for the new OCFWriter, (optional). If
	// the W parameter above is an `*os.File` which contains a Codec, the Codec
	// in the existing file will be used instead. Otherwise if this Codec
	// parameter is specified, it will be used. If neither the W parameter above
	// is an `*os.File` with an existing Codec, nor this Codec parameter is
	// specified, the OCFWriter will create a new Codec from the schema string
	// specified by the Schema parameter below.
	Codec *Codec

	// Schema specifies the Avro schema for the data to be encoded, (optional).
	// If neither the W parameter above is an `*os.File` with an existing Codec,
	// nor the Codec parameter above is specified, the OCFWriter will create a
	// new Codec from the schema string specified by this Schema parameter.
	Schema string

	// CompressionName specifies the compression codec used, (optional). If
	// omitted, defaults to "null" codec. When appending to an existing OCF,
	// this field is ignored.
	CompressionName string

	// MetaData specifies application specific meta data to be added to the
	// OCF file. When appending to an existing OCF, this field is ignored.
	MetaData map[string][]byte
}
||||
|
||||
// OCFWriter is used to create a new or append to an existing Avro Object
// Container File (OCF).
type OCFWriter struct {
	header *ocfHeader // header of the file being written (newly created or read from an existing file)
	iow    io.Writer  // destination for encoded blocks
}
||||
|
||||
// NewOCFWriter returns a new OCFWriter instance that may be used for appending
|
||||
// binary Avro data, either by appending to an existing OCF file or creating a
|
||||
// new OCF file.
|
||||
func NewOCFWriter(config OCFConfig) (*OCFWriter, error) { |
||||
var err error |
||||
ocf := &OCFWriter{iow: config.W} |
||||
|
||||
switch config.W.(type) { |
||||
case nil: |
||||
return nil, errors.New("cannot create OCFWriter when W is nil") |
||||
case *os.File: |
||||
file := config.W.(*os.File) |
||||
stat, err := file.Stat() |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot create OCFWriter: %s", err) |
||||
} |
||||
// NOTE: When upstream provides a new file, it will already exist but
|
||||
// have a size of 0 bytes.
|
||||
if stat.Size() > 0 { |
||||
// attempt to read existing OCF header
|
||||
if ocf.header, err = readOCFHeader(file); err != nil { |
||||
return nil, fmt.Errorf("cannot create OCFWriter: %s", err) |
||||
} |
||||
// prepare for appending data to existing OCF
|
||||
if err = ocf.quickScanToTail(file); err != nil { |
||||
return nil, fmt.Errorf("cannot create OCFWriter: %s", err) |
||||
} |
||||
return ocf, nil // happy case for appending to existing OCF
|
||||
} |
||||
} |
||||
|
||||
// create new OCF header based on configuration parameters
|
||||
if ocf.header, err = newOCFHeader(config); err != nil { |
||||
return nil, fmt.Errorf("cannot create OCFWriter: %s", err) |
||||
} |
||||
if err = writeOCFHeader(ocf.header, config.W); err != nil { |
||||
return nil, fmt.Errorf("cannot create OCFWriter: %s", err) |
||||
} |
||||
return ocf, nil // another happy case for creation of new OCF
|
||||
} |
||||
|
||||
// quickScanToTail advances the stream reader to the tail end of the
|
||||
// file. Rather than reading each encoded block, optionally decompressing it,
|
||||
// and then decoding it, this method reads the block count, ignoring it, then
|
||||
// reads the block size, then skips ahead to the followig block. It does this
|
||||
// repeatedly until attempts to read the file return io.EOF.
|
||||
func (ocfw *OCFWriter) quickScanToTail(ior io.Reader) error { |
||||
sync := make([]byte, ocfSyncLength) |
||||
for { |
||||
// Read and validate block count
|
||||
blockCount, err := longBinaryReader(ior) |
||||
if err != nil { |
||||
if err == io.EOF { |
||||
return nil // merely end of file, rather than error
|
||||
} |
||||
return fmt.Errorf("cannot read block count: %s", err) |
||||
} |
||||
if blockCount <= 0 { |
||||
return fmt.Errorf("cannot read when block count is not greater than 0: %d", blockCount) |
||||
} |
||||
if blockCount > MaxBlockCount { |
||||
return fmt.Errorf("cannot read when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount) |
||||
} |
||||
// Read block size
|
||||
blockSize, err := longBinaryReader(ior) |
||||
if err != nil { |
||||
return fmt.Errorf("cannot read block size: %s", err) |
||||
} |
||||
if blockSize <= 0 { |
||||
return fmt.Errorf("cannot read when block size is not greater than 0: %d", blockSize) |
||||
} |
||||
if blockSize > MaxBlockSize { |
||||
return fmt.Errorf("cannot read when block size exceeds MaxBlockSize: %d > %d", blockSize, MaxBlockSize) |
||||
} |
||||
// Advance reader to end of block
|
||||
if _, err = io.CopyN(ioutil.Discard, ior, blockSize); err != nil { |
||||
return fmt.Errorf("cannot seek to next block: %s", err) |
||||
} |
||||
// Read and validate sync marker
|
||||
var n int |
||||
if n, err = io.ReadFull(ior, sync); err != nil { |
||||
return fmt.Errorf("cannot read sync marker: read %d out of %d bytes: %s", n, ocfSyncLength, err) |
||||
} |
||||
if !bytes.Equal(sync, ocfw.header.syncMarker[:]) { |
||||
return fmt.Errorf("sync marker mismatch: %v != %v", sync, ocfw.header.syncMarker) |
||||
} |
||||
} |
||||
} |
||||
|
||||
// Append appends one or more data items to an OCF file in a block. If there are
|
||||
// more data items in the slice than MaxBlockCount allows, the data slice will
|
||||
// be chunked into multiple blocks, each not having more than MaxBlockCount
|
||||
// items.
|
||||
func (ocfw *OCFWriter) Append(data interface{}) error { |
||||
arrayValues, err := convertArray(data) |
||||
if err != nil { |
||||
return err |
||||
} |
||||
|
||||
// Chunk data so no block has more than MaxBlockCount items.
|
||||
for int64(len(arrayValues)) > MaxBlockCount { |
||||
if err := ocfw.appendDataIntoBlock(arrayValues[:MaxBlockCount]); err != nil { |
||||
return err |
||||
} |
||||
arrayValues = arrayValues[MaxBlockCount:] |
||||
} |
||||
return ocfw.appendDataIntoBlock(arrayValues) |
||||
} |
||||
|
||||
func (ocfw *OCFWriter) appendDataIntoBlock(data []interface{}) error { |
||||
var block []byte // working buffer for encoding data values
|
||||
var err error |
||||
|
||||
// Encode and concatenate each data item into the block
|
||||
for _, datum := range data { |
||||
if block, err = ocfw.header.codec.BinaryFromNative(block, datum); err != nil { |
||||
return fmt.Errorf("cannot translate datum to binary: %v; %s", datum, err) |
||||
} |
||||
} |
||||
|
||||
switch ocfw.header.compressionID { |
||||
case compressionNull: |
||||
// no-op
|
||||
|
||||
case compressionDeflate: |
||||
// compress into new bytes buffer.
|
||||
bb := bytes.NewBuffer(make([]byte, 0, len(block))) |
||||
|
||||
cw, _ := flate.NewWriter(bb, flate.DefaultCompression) |
||||
// writing bytes to cw will compress bytes and send to bb.
|
||||
if _, err := cw.Write(block); err != nil { |
||||
return err |
||||
} |
||||
if err := cw.Close(); err != nil { |
||||
return err |
||||
} |
||||
block = bb.Bytes() |
||||
|
||||
case compressionSnappy: |
||||
compressed := snappy.Encode(nil, block) |
||||
|
||||
// OCF requires snappy to have CRC32 checksum after each snappy block
|
||||
compressed = append(compressed, 0, 0, 0, 0) // expand slice by 4 bytes so checksum will fit
|
||||
binary.BigEndian.PutUint32(compressed[len(compressed)-4:], crc32.ChecksumIEEE(block)) // checksum of decompressed block
|
||||
|
||||
block = compressed |
||||
|
||||
default: |
||||
return fmt.Errorf("should not get here: cannot compress block using unrecognized compression: %d", ocfw.header.compressionID) |
||||
|
||||
} |
||||
|
||||
// create file data block
|
||||
buf := make([]byte, 0, len(block)+ocfBlockConst) // pre-allocate block bytes
|
||||
buf, _ = longBinaryFromNative(buf, len(data)) // block count (number of data items)
|
||||
buf, _ = longBinaryFromNative(buf, len(block)) // block size (number of bytes in block)
|
||||
buf = append(buf, block...) // serialized objects
|
||||
buf = append(buf, ocfw.header.syncMarker[:]...) // sync marker
|
||||
|
||||
_, err = ocfw.iow.Write(buf) |
||||
return err |
||||
} |
||||
|
||||
// Codec returns the codec used by OCFWriter. This function is provided because
// upstream may be appending to an existing OCF which uses a different schema
// than requested during instantiation.
func (ocfw *OCFWriter) Codec() *Codec {
	return ocfw.header.codec
}
||||
|
||||
// CompressionName returns the name of the compression algorithm used by
|
||||
// OCFWriter. This function provided because upstream may be appending to
|
||||
// existing OCF which uses a different compression algorithm than requested
|
||||
// during instantiation. the OCF file.
|
||||
func (ocfw *OCFWriter) CompressionName() string { |
||||
switch ocfw.header.compressionID { |
||||
case compressionNull: |
||||
return CompressionNullLabel |
||||
case compressionDeflate: |
||||
return CompressionDeflateLabel |
||||
case compressionSnappy: |
||||
return CompressionSnappyLabel |
||||
default: |
||||
return "should not get here: unrecognized compression algorithm" |
||||
} |
||||
} |
@ -0,0 +1,185 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"fmt" |
||||
) |
||||
|
||||
func makeRecordCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) { |
||||
// NOTE: To support recursive data types, create the codec and register it
|
||||
// using the specified name, and fill in the codec functions later.
|
||||
c, err := registerNewCodec(st, schemaMap, enclosingNamespace) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Record ought to have valid name: %s", err) |
||||
} |
||||
|
||||
fields, ok := schemaMap["fields"] |
||||
if !ok { |
||||
return nil, fmt.Errorf("Record %q ought to have fields key", c.typeName) |
||||
} |
||||
fieldSchemas, ok := fields.([]interface{}) |
||||
if !ok || len(fieldSchemas) == 0 { |
||||
return nil, fmt.Errorf("Record %q fields ought to be non-empty array: %v", c.typeName, fields) |
||||
} |
||||
|
||||
codecFromFieldName := make(map[string]*Codec) |
||||
codecFromIndex := make([]*Codec, len(fieldSchemas)) |
||||
nameFromIndex := make([]string, len(fieldSchemas)) |
||||
defaultValueFromName := make(map[string]interface{}, len(fieldSchemas)) |
||||
|
||||
for i, fieldSchema := range fieldSchemas { |
||||
fieldSchemaMap, ok := fieldSchema.(map[string]interface{}) |
||||
if !ok { |
||||
return nil, fmt.Errorf("Record %q field %d ought to be valid Avro named type; received: %v", c.typeName, i+1, fieldSchema) |
||||
} |
||||
|
||||
// NOTE: field names are not registered in the symbol table, because
|
||||
// field names are not individually addressable codecs.
|
||||
|
||||
fieldCodec, err := buildCodecForTypeDescribedByMap(st, c.typeName.namespace, fieldSchemaMap) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Record %q field %d ought to be valid Avro named type: %s", c.typeName, i+1, err) |
||||
} |
||||
|
||||
// However, when creating a full name for the field name, be sure to use
|
||||
// record's namespace
|
||||
n, err := newNameFromSchemaMap(c.typeName.namespace, fieldSchemaMap) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Record %q field %d ought to have valid name: %v", c.typeName, i+1, fieldSchemaMap) |
||||
} |
||||
fieldName := n.short() |
||||
if _, ok := codecFromFieldName[fieldName]; ok { |
||||
return nil, fmt.Errorf("Record %q field %d ought to have unique name: %q", c.typeName, i+1, fieldName) |
||||
} |
||||
|
||||
if defaultValue, ok := fieldSchemaMap["default"]; ok { |
||||
// if codec is union, then default value ought to encode using first schema in union
|
||||
if fieldCodec.typeName.short() == "union" { |
||||
// NOTE: To support a null default value,
|
||||
// the string literal "null" must be coerced to a `nil`
|
||||
if defaultValue == "null" { |
||||
defaultValue = nil |
||||
} |
||||
// NOTE: To support record field default values, union schema
|
||||
// set to the type name of first member
|
||||
defaultValue = Union(fieldCodec.schema, defaultValue) |
||||
} |
||||
// attempt to encode default value using codec
|
||||
_, err = fieldCodec.binaryFromNative(nil, defaultValue) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Record %q field %q: default value ought to encode using field schema: %s", c.typeName, fieldName, err) |
||||
} |
||||
defaultValueFromName[fieldName] = defaultValue |
||||
} |
||||
|
||||
nameFromIndex[i] = fieldName |
||||
codecFromIndex[i] = fieldCodec |
||||
codecFromFieldName[fieldName] = fieldCodec |
||||
} |
||||
|
||||
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) { |
||||
valueMap, ok := datum.(map[string]interface{}) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode binary record %q: expected map[string]interface{}; received: %T", c.typeName, datum) |
||||
} |
||||
|
||||
// records encoded in order fields were defined in schema
|
||||
for i, fieldCodec := range codecFromIndex { |
||||
fieldName := nameFromIndex[i] |
||||
|
||||
// NOTE: If field value was not specified in map, then set
|
||||
// fieldValue to its default value (which may or may not have been
|
||||
// specified).
|
||||
fieldValue, ok := valueMap[fieldName] |
||||
if !ok { |
||||
if fieldValue, ok = defaultValueFromName[fieldName]; !ok { |
||||
return nil, fmt.Errorf("cannot encode binary record %q field %q: schema does not specify default value and no value provided", c.typeName, fieldName) |
||||
} |
||||
} |
||||
|
||||
var err error |
||||
buf, err = fieldCodec.binaryFromNative(buf, fieldValue) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot encode binary record %q field %q: value does not match its schema: %s", c.typeName, fieldName, err) |
||||
} |
||||
} |
||||
return buf, nil |
||||
} |
||||
|
||||
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) { |
||||
recordMap := make(map[string]interface{}, len(codecFromIndex)) |
||||
for i, fieldCodec := range codecFromIndex { |
||||
name := nameFromIndex[i] |
||||
var value interface{} |
||||
var err error |
||||
value, buf, err = fieldCodec.nativeFromBinary(buf) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary record %q field %q: %s", c.typeName, name, err) |
||||
} |
||||
recordMap[name] = value |
||||
} |
||||
return recordMap, buf, nil |
||||
} |
||||
|
||||
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) { |
||||
var mapValues map[string]interface{} |
||||
var err error |
||||
// NOTE: Setting `defaultCodec == nil` instructs genericMapTextDecoder
|
||||
// to return an error when a field name is not found in the
|
||||
// codecFromFieldName map.
|
||||
mapValues, buf, err = genericMapTextDecoder(buf, nil, codecFromFieldName) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual record %q: %s", c.typeName, err) |
||||
} |
||||
if actual, expected := len(mapValues), len(codecFromFieldName); actual != expected { |
||||
// set missing field keys to their respective default values, then
|
||||
// re-check number of keys
|
||||
for fieldName, defaultValue := range defaultValueFromName { |
||||
if _, ok := mapValues[fieldName]; !ok { |
||||
mapValues[fieldName] = defaultValue |
||||
} |
||||
} |
||||
if actual, expected = len(mapValues), len(codecFromFieldName); actual != expected { |
||||
return nil, nil, fmt.Errorf("cannot decode textual record %q: only found %d of %d fields", c.typeName, actual, expected) |
||||
} |
||||
} |
||||
return mapValues, buf, nil |
||||
} |
||||
|
||||
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) { |
||||
// NOTE: Ensure only schema defined field names are encoded; and if
|
||||
// missing in datum, either use the provided field default value or
|
||||
// return an error.
|
||||
sourceMap, ok := datum.(map[string]interface{}) |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode textual record %q: expected map[string]interface{}; received: %T", c.typeName, datum) |
||||
} |
||||
destMap := make(map[string]interface{}, len(codecFromIndex)) |
||||
for fieldName := range codecFromFieldName { |
||||
fieldValue, ok := sourceMap[fieldName] |
||||
if !ok { |
||||
defaultValue, ok := defaultValueFromName[fieldName] |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode textual record %q field %q: schema does not specify default value and no value provided", c.typeName, fieldName) |
||||
} |
||||
fieldValue = defaultValue |
||||
} |
||||
destMap[fieldName] = fieldValue |
||||
} |
||||
datum = destMap |
||||
// NOTE: Setting `defaultCodec == nil` instructs genericMapTextEncoder
|
||||
// to return an error when a field name is not found in the
|
||||
// codecFromFieldName map.
|
||||
return genericMapTextEncoder(buf, datum, nil, codecFromFieldName) |
||||
} |
||||
|
||||
return c, nil |
||||
} |
@ -0,0 +1,41 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"fmt" |
||||
"io" |
||||
"unicode" |
||||
) |
||||
|
||||
// advanceAndConsume advances to non whitespace and returns an error if the next
|
||||
// non whitespace byte is not what is expected.
|
||||
func advanceAndConsume(buf []byte, expected byte) ([]byte, error) { |
||||
var err error |
||||
if buf, err = advanceToNonWhitespace(buf); err != nil { |
||||
return nil, err |
||||
} |
||||
if actual := buf[0]; actual != expected { |
||||
return nil, fmt.Errorf("expected: %q; actual: %q", expected, actual) |
||||
} |
||||
return buf[1:], nil |
||||
} |
||||
|
||||
// advanceToNonWhitespace drops leading whitespace bytes from buf and returns
// the remainder, starting at the first non-whitespace byte. Because its job is
// to find the next non-whitespace byte, running out of input (an empty or
// all-whitespace buf) is reported as io.ErrShortBuffer.
func advanceToNonWhitespace(buf []byte) ([]byte, error) {
	for len(buf) > 0 {
		if !unicode.IsSpace(rune(buf[0])) {
			return buf, nil
		}
		buf = buf[1:]
	}
	return nil, io.ErrShortBuffer
}
@ -0,0 +1,178 @@ |
||||
// Copyright [2017] LinkedIn Corp. Licensed under the Apache License, Version
|
||||
// 2.0 (the "License"); you may not use this file except in compliance with the
|
||||
// License. You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
package goavro |
||||
|
||||
import ( |
||||
"bytes" |
||||
"errors" |
||||
"fmt" |
||||
) |
||||
|
||||
// Union wraps a datum value in the form the union encoders expect.
//
// A union value is either `nil`, which stands for the `null` member, or a
// `map[string]interface{}` holding exactly one key-value pair whose key is the
// Avro type name of the selected member and whose value is the datum itself.
// Union builds that single-entry map from name and datum. The one exception is
// the combination of name "null" with a nil datum, which yields a bare `nil`.
//
// For example, with a codec built from the schema `["null","string","int"]`,
// textually encoding `Union("string", "some string")` produces:
//
//	{"string":"some string"}
func Union(name string, datum interface{}) interface{} {
	if name != "null" || datum != nil {
		return map[string]interface{}{name: datum}
	}
	return nil
}
||||
|
||||
func buildCodecForTypeDescribedBySlice(st map[string]*Codec, enclosingNamespace string, schemaArray []interface{}) (*Codec, error) { |
||||
if len(schemaArray) == 0 { |
||||
return nil, errors.New("Union ought to have one or more members") |
||||
} |
||||
|
||||
allowedTypes := make([]string, len(schemaArray)) // used for error reporting when encoder receives invalid datum type
|
||||
codecFromIndex := make([]*Codec, len(schemaArray)) |
||||
codecFromName := make(map[string]*Codec, len(schemaArray)) |
||||
indexFromName := make(map[string]int, len(schemaArray)) |
||||
|
||||
for i, unionMemberSchema := range schemaArray { |
||||
unionMemberCodec, err := buildCodec(st, enclosingNamespace, unionMemberSchema) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("Union item %d ought to be valid Avro type: %s", i+1, err) |
||||
} |
||||
fullName := unionMemberCodec.typeName.fullName |
||||
if _, ok := indexFromName[fullName]; ok { |
||||
return nil, fmt.Errorf("Union item %d ought to be unique type: %s", i+1, unionMemberCodec.typeName) |
||||
} |
||||
allowedTypes[i] = fullName |
||||
codecFromIndex[i] = unionMemberCodec |
||||
codecFromName[fullName] = unionMemberCodec |
||||
indexFromName[fullName] = i |
||||
} |
||||
|
||||
return &Codec{ |
||||
// NOTE: To support record field default values, union schema set to the
|
||||
// type name of first member
|
||||
schema: codecFromIndex[0].typeName.short(), |
||||
|
||||
typeName: &name{"union", nullNamespace}, |
||||
nativeFromBinary: func(buf []byte) (interface{}, []byte, error) { |
||||
var decoded interface{} |
||||
var err error |
||||
|
||||
decoded, buf, err = longNativeFromBinary(buf) |
||||
if err != nil { |
||||
return nil, nil, err |
||||
} |
||||
index := decoded.(int64) // longDecoder always returns int64, so elide error checking
|
||||
if index < 0 || index >= int64(len(codecFromIndex)) { |
||||
return nil, nil, fmt.Errorf("cannot decode binary union: index ought to be between 0 and %d; read index: %d", len(codecFromIndex)-1, index) |
||||
} |
||||
c := codecFromIndex[index] |
||||
decoded, buf, err = c.nativeFromBinary(buf) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode binary union item %d: %s", index+1, err) |
||||
} |
||||
if decoded == nil { |
||||
// do not wrap a nil value in a map
|
||||
return nil, buf, nil |
||||
} |
||||
// Non-nil values are wrapped in a map with single key set to type name of value
|
||||
return Union(allowedTypes[index], decoded), buf, nil |
||||
}, |
||||
binaryFromNative: func(buf []byte, datum interface{}) ([]byte, error) { |
||||
switch v := datum.(type) { |
||||
case nil: |
||||
index, ok := indexFromName["null"] |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode binary union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum) |
||||
} |
||||
return longBinaryFromNative(buf, index) |
||||
case map[string]interface{}: |
||||
if len(v) != 1 { |
||||
return nil, fmt.Errorf("cannot encode binary union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum) |
||||
} |
||||
// will execute exactly once
|
||||
for key, value := range v { |
||||
index, ok := indexFromName[key] |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode binary union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum) |
||||
} |
||||
c := codecFromIndex[index] |
||||
buf, _ = longBinaryFromNative(buf, index) |
||||
return c.binaryFromNative(buf, value) |
||||
} |
||||
} |
||||
return nil, fmt.Errorf("cannot encode binary union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum) |
||||
}, |
||||
nativeFromTextual: func(buf []byte) (interface{}, []byte, error) { |
||||
if len(buf) >= 4 && bytes.Equal(buf[:4], []byte("null")) { |
||||
if _, ok := indexFromName["null"]; ok { |
||||
return nil, buf[4:], nil |
||||
} |
||||
} |
||||
|
||||
var datum interface{} |
||||
var err error |
||||
datum, buf, err = genericMapTextDecoder(buf, nil, codecFromName) |
||||
if err != nil { |
||||
return nil, nil, fmt.Errorf("cannot decode textual union: %s", err) |
||||
} |
||||
|
||||
return datum, buf, nil |
||||
}, |
||||
textualFromNative: func(buf []byte, datum interface{}) ([]byte, error) { |
||||
switch v := datum.(type) { |
||||
case nil: |
||||
_, ok := indexFromName["null"] |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum) |
||||
} |
||||
return append(buf, "null"...), nil |
||||
case map[string]interface{}: |
||||
if len(v) != 1 { |
||||
return nil, fmt.Errorf("cannot encode textual union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum) |
||||
} |
||||
// will execute exactly once
|
||||
for key, value := range v { |
||||
index, ok := indexFromName[key] |
||||
if !ok { |
||||
return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum) |
||||
} |
||||
buf = append(buf, '{') |
||||
var err error |
||||
buf, err = stringTextualFromNative(buf, key) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot encode textual union: %s", err) |
||||
} |
||||
buf = append(buf, ':') |
||||
c := codecFromIndex[index] |
||||
buf, err = c.textualFromNative(buf, value) |
||||
if err != nil { |
||||
return nil, fmt.Errorf("cannot encode textual union: %s", err) |
||||
} |
||||
return append(buf, '}'), nil |
||||
} |
||||
} |
||||
return nil, fmt.Errorf("cannot encode textual union: non-nil values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum) |
||||
}, |
||||
}, nil |
||||
} |
Loading…
Reference in new issue