mirror of https://github.com/grafana/loki
Promtail: Add text encoding conversion to file targets (#6395)
Adds support for text encoding conversion for file targets. To use it, add `encoding: <encoding_name>` to the scrape config.
pull/6618/head
parent
2f192bbb25
commit
f17d3d768c
@ -0,0 +1,249 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run maketables.go
|
||||
|
||||
// Package charmap provides simple character encodings such as IBM Code Page 437
|
||||
// and Windows 1252.
|
||||
package charmap // import "golang.org/x/text/encoding/charmap"
|
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// These encodings vary only in the way clients should interpret them. Their
|
||||
// coded character set is identical and a single implementation can be shared.
|
||||
var ( |
||||
// ISO8859_6E is the ISO 8859-6E encoding.
|
||||
ISO8859_6E encoding.Encoding = &iso8859_6E |
||||
|
||||
// ISO8859_6I is the ISO 8859-6I encoding.
|
||||
ISO8859_6I encoding.Encoding = &iso8859_6I |
||||
|
||||
// ISO8859_8E is the ISO 8859-8E encoding.
|
||||
ISO8859_8E encoding.Encoding = &iso8859_8E |
||||
|
||||
// ISO8859_8I is the ISO 8859-8I encoding.
|
||||
ISO8859_8I encoding.Encoding = &iso8859_8I |
||||
|
||||
iso8859_6E = internal.Encoding{ |
||||
Encoding: ISO8859_6, |
||||
Name: "ISO-8859-6E", |
||||
MIB: identifier.ISO88596E, |
||||
} |
||||
|
||||
iso8859_6I = internal.Encoding{ |
||||
Encoding: ISO8859_6, |
||||
Name: "ISO-8859-6I", |
||||
MIB: identifier.ISO88596I, |
||||
} |
||||
|
||||
iso8859_8E = internal.Encoding{ |
||||
Encoding: ISO8859_8, |
||||
Name: "ISO-8859-8E", |
||||
MIB: identifier.ISO88598E, |
||||
} |
||||
|
||||
iso8859_8I = internal.Encoding{ |
||||
Encoding: ISO8859_8, |
||||
Name: "ISO-8859-8I", |
||||
MIB: identifier.ISO88598I, |
||||
} |
||||
) |
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All []encoding.Encoding = listAll |
||||
|
||||
// TODO: implement these encodings, in order of importance.
|
||||
// ASCII, ISO8859_1: Rather common. Close to Windows 1252.
|
||||
// ISO8859_9: Close to Windows 1254.
|
||||
|
||||
// utf8Enc stores the UTF-8 encoding of one rune. Only the first len bytes of
// data are meaningful; three bytes are reserved for the encoded form.
type utf8Enc struct {
	len  uint8   // number of valid bytes in data
	data [3]byte // UTF-8 bytes of the rune
}
||||
|
||||
// Charmap is an 8-bit character set encoding.
|
||||
type Charmap struct { |
||||
// name is the encoding's name.
|
||||
name string |
||||
// mib is the encoding type of this encoder.
|
||||
mib identifier.MIB |
||||
// asciiSuperset states whether the encoding is a superset of ASCII.
|
||||
asciiSuperset bool |
||||
// low is the lower bound of the encoded byte for a non-ASCII rune. If
|
||||
// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
|
||||
low uint8 |
||||
// replacement is the encoded replacement character.
|
||||
replacement byte |
||||
// decode is the map from encoded byte to UTF-8.
|
||||
decode [256]utf8Enc |
||||
// encoding is the map from runes to encoded bytes. Each entry is a
|
||||
// uint32: the high 8 bits are the encoded byte and the low 24 bits are
|
||||
// the rune. The table entries are sorted by ascending rune.
|
||||
encode [256]uint32 |
||||
} |
||||
|
||||
// NewDecoder implements the encoding.Encoding interface.
|
||||
func (m *Charmap) NewDecoder() *encoding.Decoder { |
||||
return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}} |
||||
} |
||||
|
||||
// NewEncoder implements the encoding.Encoding interface.
|
||||
func (m *Charmap) NewEncoder() *encoding.Encoder { |
||||
return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}} |
||||
} |
||||
|
||||
// String returns the Charmap's name.
|
||||
func (m *Charmap) String() string { |
||||
return m.name |
||||
} |
||||
|
||||
// ID implements an internal interface.
|
||||
func (m *Charmap) ID() (mib identifier.MIB, other string) { |
||||
return m.mib, "" |
||||
} |
||||
|
||||
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
|
||||
type charmapDecoder struct { |
||||
transform.NopResetter |
||||
charmap *Charmap |
||||
} |
||||
|
||||
func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
for i, c := range src { |
||||
if m.charmap.asciiSuperset && c < utf8.RuneSelf { |
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = c |
||||
nDst++ |
||||
nSrc = i + 1 |
||||
continue |
||||
} |
||||
|
||||
decode := &m.charmap.decode[c] |
||||
n := int(decode.len) |
||||
if nDst+n > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
// It's 15% faster to avoid calling copy for these tiny slices.
|
||||
for j := 0; j < n; j++ { |
||||
dst[nDst] = decode.data[j] |
||||
nDst++ |
||||
} |
||||
nSrc = i + 1 |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
// DecodeByte returns the Charmap's rune decoding of the byte b.
|
||||
func (m *Charmap) DecodeByte(b byte) rune { |
||||
switch x := &m.decode[b]; x.len { |
||||
case 1: |
||||
return rune(x.data[0]) |
||||
case 2: |
||||
return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f) |
||||
default: |
||||
return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f) |
||||
} |
||||
} |
||||
|
||||
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
|
||||
type charmapEncoder struct { |
||||
transform.NopResetter |
||||
charmap *Charmap |
||||
} |
||||
|
||||
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
loop: |
||||
for nSrc < len(src) { |
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
if m.charmap.asciiSuperset { |
||||
nSrc++ |
||||
dst[nDst] = uint8(r) |
||||
nDst++ |
||||
continue |
||||
} |
||||
size = 1 |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
} else { |
||||
err = internal.RepertoireError(m.charmap.replacement) |
||||
} |
||||
break |
||||
} |
||||
} |
||||
|
||||
// Binary search in [low, high) for that rune in the m.charmap.encode table.
|
||||
for low, high := int(m.charmap.low), 0x100; ; { |
||||
if low >= high { |
||||
err = internal.RepertoireError(m.charmap.replacement) |
||||
break loop |
||||
} |
||||
mid := (low + high) / 2 |
||||
got := m.charmap.encode[mid] |
||||
gotRune := rune(got & (1<<24 - 1)) |
||||
if gotRune < r { |
||||
low = mid + 1 |
||||
} else if gotRune > r { |
||||
high = mid |
||||
} else { |
||||
dst[nDst] = byte(got >> 24) |
||||
nDst++ |
||||
break |
||||
} |
||||
} |
||||
nSrc += size |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
// EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether
|
||||
// r is in the Charmap's repertoire. If not, b is set to the Charmap's
|
||||
// replacement byte. This is often the ASCII substitute character '\x1a'.
|
||||
func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) { |
||||
if r < utf8.RuneSelf && m.asciiSuperset { |
||||
return byte(r), true |
||||
} |
||||
for low, high := int(m.low), 0x100; ; { |
||||
if low >= high { |
||||
return m.replacement, false |
||||
} |
||||
mid := (low + high) / 2 |
||||
got := m.encode[mid] |
||||
gotRune := rune(got & (1<<24 - 1)) |
||||
if gotRune < r { |
||||
low = mid + 1 |
||||
} else if gotRune > r { |
||||
high = mid |
||||
} else { |
||||
return byte(got >> 24), true |
||||
} |
||||
} |
||||
} |
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,335 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package encoding defines an interface for character encodings, such as Shift
|
||||
// JIS and Windows 1252, that can convert to and from UTF-8.
|
||||
//
|
||||
// Encoding implementations are provided in other packages, such as
|
||||
// golang.org/x/text/encoding/charmap and
|
||||
// golang.org/x/text/encoding/japanese.
|
||||
package encoding // import "golang.org/x/text/encoding"
|
||||
|
||||
import ( |
||||
"errors" |
||||
"io" |
||||
"strconv" |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// TODO:
|
||||
// - There seems to be some inconsistency in when decoders return errors
|
||||
// and when not. Also documentation seems to suggest they shouldn't return
|
||||
// errors at all (except for UTF-16).
|
||||
// - Encoders seem to rely on or at least benefit from the input being in NFC
|
||||
// normal form. Perhaps add an example how users could prepare their output.
|
||||
|
||||
// Encoding is a character set encoding that can be transformed to and from
|
||||
// UTF-8.
|
||||
type Encoding interface { |
||||
// NewDecoder returns a Decoder.
|
||||
NewDecoder() *Decoder |
||||
|
||||
// NewEncoder returns an Encoder.
|
||||
NewEncoder() *Encoder |
||||
} |
||||
|
||||
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
|
||||
//
|
||||
// Transforming source bytes that are not of that encoding will not result in an
|
||||
// error per se. Each byte that cannot be transcoded will be represented in the
|
||||
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
|
||||
type Decoder struct { |
||||
transform.Transformer |
||||
|
||||
// This forces external creators of Decoders to use names in struct
|
||||
// initializers, allowing for future extendibility without having to break
|
||||
// code.
|
||||
_ struct{} |
||||
} |
||||
|
||||
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
|
||||
// bytes or nil, err if any error occurred.
|
||||
func (d *Decoder) Bytes(b []byte) ([]byte, error) { |
||||
b, _, err := transform.Bytes(d, b) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
return b, nil |
||||
} |
||||
|
||||
// String converts the given encoded string to UTF-8. It returns the converted
|
||||
// string or "", err if any error occurred.
|
||||
func (d *Decoder) String(s string) (string, error) { |
||||
s, _, err := transform.String(d, s) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
return s, nil |
||||
} |
||||
|
||||
// Reader wraps another Reader to decode its bytes.
|
||||
//
|
||||
// The Decoder may not be used for any other operation as long as the returned
|
||||
// Reader is in use.
|
||||
func (d *Decoder) Reader(r io.Reader) io.Reader { |
||||
return transform.NewReader(r, d) |
||||
} |
||||
|
||||
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
|
||||
//
|
||||
// Each rune that cannot be transcoded will result in an error. In this case,
|
||||
// the transform will consume all source byte up to, not including the offending
|
||||
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
|
||||
// `\uFFFD`. To return early with an error instead, use transform.Chain to
|
||||
// preprocess the data with a UTF8Validator.
|
||||
type Encoder struct { |
||||
transform.Transformer |
||||
|
||||
// This forces external creators of Encoders to use names in struct
|
||||
// initializers, allowing for future extendibility without having to break
|
||||
// code.
|
||||
_ struct{} |
||||
} |
||||
|
||||
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
|
||||
// any error occurred.
|
||||
func (e *Encoder) Bytes(b []byte) ([]byte, error) { |
||||
b, _, err := transform.Bytes(e, b) |
||||
if err != nil { |
||||
return nil, err |
||||
} |
||||
return b, nil |
||||
} |
||||
|
||||
// String converts a string from UTF-8. It returns the converted string or
|
||||
// "", err if any error occurred.
|
||||
func (e *Encoder) String(s string) (string, error) { |
||||
s, _, err := transform.String(e, s) |
||||
if err != nil { |
||||
return "", err |
||||
} |
||||
return s, nil |
||||
} |
||||
|
||||
// Writer wraps another Writer to encode its UTF-8 output.
|
||||
//
|
||||
// The Encoder may not be used for any other operation as long as the returned
|
||||
// Writer is in use.
|
||||
func (e *Encoder) Writer(w io.Writer) io.Writer { |
||||
return transform.NewWriter(w, e) |
||||
} |
||||
|
||||
// ASCIISub is the ASCII substitute character (0x1A), as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
||||
|
||||
// Nop is the nop encoding. Its transformed bytes are the same as the source
|
||||
// bytes; it does not replace invalid UTF-8 sequences.
|
||||
var Nop Encoding = nop{} |
||||
|
||||
type nop struct{} |
||||
|
||||
func (nop) NewDecoder() *Decoder { |
||||
return &Decoder{Transformer: transform.Nop} |
||||
} |
||||
func (nop) NewEncoder() *Encoder { |
||||
return &Encoder{Transformer: transform.Nop} |
||||
} |
||||
|
||||
// Replacement is the replacement encoding. Decoding from the replacement
|
||||
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
|
||||
// the replacement encoding yields the same as the source bytes except that
|
||||
// invalid UTF-8 is converted to '\uFFFD'.
|
||||
//
|
||||
// It is defined at http://encoding.spec.whatwg.org/#replacement
|
||||
var Replacement Encoding = replacement{} |
||||
|
||||
type replacement struct{} |
||||
|
||||
func (replacement) NewDecoder() *Decoder { |
||||
return &Decoder{Transformer: replacementDecoder{}} |
||||
} |
||||
|
||||
func (replacement) NewEncoder() *Encoder { |
||||
return &Encoder{Transformer: replacementEncoder{}} |
||||
} |
||||
|
||||
func (replacement) ID() (mib identifier.MIB, other string) { |
||||
return identifier.Replacement, "" |
||||
} |
||||
|
||||
type replacementDecoder struct{ transform.NopResetter } |
||||
|
||||
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
if len(dst) < 3 { |
||||
return 0, 0, transform.ErrShortDst |
||||
} |
||||
if atEOF { |
||||
const fffd = "\ufffd" |
||||
dst[0] = fffd[0] |
||||
dst[1] = fffd[1] |
||||
dst[2] = fffd[2] |
||||
nDst = 3 |
||||
} |
||||
return nDst, len(src), nil |
||||
} |
||||
|
||||
type replacementEncoder struct{ transform.NopResetter } |
||||
|
||||
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
|
||||
for ; nSrc < len(src); nSrc += size { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
r = '\ufffd' |
||||
} |
||||
} |
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
|
||||
// repertoire of the destination encoding with HTML escape sequences.
|
||||
//
|
||||
// This wrapper exists to comply to URL and HTML forms requiring a
|
||||
// non-terminating legacy encoder. The produced sequences may lead to data
|
||||
// loss as they are indistinguishable from legitimate input. To avoid this
|
||||
// issue, use UTF-8 encodings whenever possible.
|
||||
func HTMLEscapeUnsupported(e *Encoder) *Encoder { |
||||
return &Encoder{Transformer: &errorHandler{e, errorToHTML}} |
||||
} |
||||
|
||||
// ReplaceUnsupported wraps encoders to replace source runes outside the
|
||||
// repertoire of the destination encoding with an encoding-specific
|
||||
// replacement.
|
||||
//
|
||||
// This wrapper is only provided for backwards compatibility and legacy
|
||||
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
|
||||
func ReplaceUnsupported(e *Encoder) *Encoder { |
||||
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}} |
||||
} |
||||
|
||||
type errorHandler struct { |
||||
*Encoder |
||||
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool) |
||||
} |
||||
|
||||
// repertoireError is implemented by errors that report a rune outside an
// encoding's repertoire. Replacement returns the encoded byte to substitute.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}
||||
|
||||
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF) |
||||
for err != nil { |
||||
rerr, ok := err.(repertoireError) |
||||
if !ok { |
||||
return nDst, nSrc, err |
||||
} |
||||
r, sz := utf8.DecodeRune(src[nSrc:]) |
||||
n, ok := h.handler(dst[nDst:], r, rerr) |
||||
if !ok { |
||||
return nDst, nSrc, transform.ErrShortDst |
||||
} |
||||
err = nil |
||||
nDst += n |
||||
if nSrc += sz; nSrc < len(src) { |
||||
var dn, sn int |
||||
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF) |
||||
nDst += dn |
||||
nSrc += sn |
||||
} |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) { |
||||
buf := [8]byte{} |
||||
b := strconv.AppendUint(buf[:0], uint64(r), 10) |
||||
if n = len(b) + len("&#;"); n >= len(dst) { |
||||
return 0, false |
||||
} |
||||
dst[0] = '&' |
||||
dst[1] = '#' |
||||
dst[copy(dst[2:], b)+2] = ';' |
||||
return n, true |
||||
} |
||||
|
||||
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) { |
||||
if len(dst) == 0 { |
||||
return 0, false |
||||
} |
||||
dst[0] = err.Replacement() |
||||
return 1, true |
||||
} |
||||
|
||||
// ErrInvalidUTF8 is returned by a transformer that encountered invalid UTF-8
// in its input.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
||||
|
||||
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
|
||||
// input byte that is not valid UTF-8.
|
||||
var UTF8Validator transform.Transformer = utf8Validator{} |
||||
|
||||
type utf8Validator struct{ transform.NopResetter } |
||||
|
||||
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
n := len(src) |
||||
if n > len(dst) { |
||||
n = len(dst) |
||||
} |
||||
for i := 0; i < n; { |
||||
if c := src[i]; c < utf8.RuneSelf { |
||||
dst[i] = c |
||||
i++ |
||||
continue |
||||
} |
||||
_, size := utf8.DecodeRune(src[i:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
err = ErrInvalidUTF8 |
||||
if !atEOF && !utf8.FullRune(src[i:]) { |
||||
err = transform.ErrShortSrc |
||||
} |
||||
return i, i, err |
||||
} |
||||
if i+size > len(dst) { |
||||
return i, i, transform.ErrShortDst |
||||
} |
||||
for ; size > 0; size-- { |
||||
dst[i] = src[i] |
||||
i++ |
||||
} |
||||
} |
||||
if len(src) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
} |
||||
return n, n, err |
||||
} |
||||
@ -0,0 +1,74 @@ |
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package ianaindex |
||||
|
||||
import ( |
||||
"unicode" |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
type asciiDecoder struct { |
||||
transform.NopResetter |
||||
} |
||||
|
||||
func (d asciiDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
for _, c := range src { |
||||
if c > unicode.MaxASCII { |
||||
r := unicode.ReplacementChar |
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
nSrc++ |
||||
continue |
||||
} |
||||
|
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = c |
||||
nDst++ |
||||
nSrc++ |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type asciiEncoder struct { |
||||
transform.NopResetter |
||||
} |
||||
|
||||
func (d asciiEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
for _, c := range src { |
||||
if c > unicode.MaxASCII { |
||||
err = internal.RepertoireError(encoding.ASCIISub) |
||||
break |
||||
} |
||||
|
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = c |
||||
nDst++ |
||||
nSrc++ |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
var asciiEnc = &internal.Encoding{ |
||||
Encoding: &internal.SimpleEncoding{ |
||||
asciiDecoder{}, |
||||
asciiEncoder{}, |
||||
}, |
||||
Name: "US-ASCII", |
||||
MIB: identifier.ASCII, |
||||
} |
||||
@ -0,0 +1,214 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go
|
||||
|
||||
// Package ianaindex maps names to Encodings as specified by the IANA registry.
|
||||
// This includes both the MIME and IANA names.
|
||||
//
|
||||
// See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
|
||||
// more details.
|
||||
package ianaindex |
||||
|
||||
import ( |
||||
"errors" |
||||
"sort" |
||||
"strings" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/charmap" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/encoding/japanese" |
||||
"golang.org/x/text/encoding/korean" |
||||
"golang.org/x/text/encoding/simplifiedchinese" |
||||
"golang.org/x/text/encoding/traditionalchinese" |
||||
"golang.org/x/text/encoding/unicode" |
||||
) |
||||
|
||||
// TODO: remove the "Status... incomplete" in the package doc comment.
|
||||
// TODO: allow users to specify their own aliases?
|
||||
// TODO: allow users to specify their own indexes?
|
||||
// TODO: allow canonicalizing names
|
||||
|
||||
// NOTE: only use these top-level variables if we can get the linker to drop
|
||||
// the indexes when they are not used. Make them a function or perhaps only
|
||||
// support MIME otherwise.
|
||||
|
||||
var ( |
||||
// MIME is an index to map MIME names.
|
||||
MIME *Index = mime |
||||
|
||||
// IANA is an index that supports all names and aliases using IANA names as
|
||||
// the canonical identifier.
|
||||
IANA *Index = iana |
||||
|
||||
// MIB is an index that associates the MIB display name with an Encoding.
|
||||
MIB *Index = mib |
||||
|
||||
mime = &Index{mimeName, ianaToMIB, ianaAliases, encodings[:]} |
||||
iana = &Index{ianaName, ianaToMIB, ianaAliases, encodings[:]} |
||||
mib = &Index{mibName, ianaToMIB, ianaAliases, encodings[:]} |
||||
) |
||||
|
||||
// Index maps names registered by IANA to Encodings.
|
||||
// Currently different Indexes only differ in the names they return for
|
||||
// encodings. In the future they may also differ in supported aliases.
|
||||
type Index struct { |
||||
names func(i int) string |
||||
toMIB []identifier.MIB // Sorted slice of supported MIBs
|
||||
alias map[string]int |
||||
enc []encoding.Encoding |
||||
} |
||||
|
||||
// Errors returned by Index lookups.
var (
	// errInvalidName is returned when a name is not in the alias table.
	errInvalidName = errors.New("ianaindex: invalid encoding name")

	// errUnknown is returned when an Encoding does not identify itself.
	errUnknown = errors.New("ianaindex: unknown Encoding")

	// errUnsupported is returned when an Encoding's MIB is not in the index.
	errUnsupported = errors.New("ianaindex: unsupported Encoding")
)
||||
|
||||
// Encoding returns an Encoding for IANA-registered names. Matching is
|
||||
// case-insensitive.
|
||||
//
|
||||
// If the provided name doesn't match a IANA-registered charset, an error is
|
||||
// returned. If the name matches a IANA-registered charset but isn't supported,
|
||||
// a nil encoding and a nil error are returned.
|
||||
func (x *Index) Encoding(name string) (encoding.Encoding, error) { |
||||
name = strings.TrimSpace(name) |
||||
// First try without lowercasing (possibly creating an allocation).
|
||||
i, ok := x.alias[name] |
||||
if !ok { |
||||
i, ok = x.alias[strings.ToLower(name)] |
||||
if !ok { |
||||
return nil, errInvalidName |
||||
} |
||||
} |
||||
return x.enc[i], nil |
||||
} |
||||
|
||||
// Name reports the canonical name of the given Encoding. It will return an
|
||||
// error if the e is not associated with a known encoding scheme.
|
||||
func (x *Index) Name(e encoding.Encoding) (string, error) { |
||||
id, ok := e.(identifier.Interface) |
||||
if !ok { |
||||
return "", errUnknown |
||||
} |
||||
mib, _ := id.ID() |
||||
if mib == 0 { |
||||
return "", errUnknown |
||||
} |
||||
v := findMIB(x.toMIB, mib) |
||||
if v == -1 { |
||||
return "", errUnsupported |
||||
} |
||||
return x.names(v), nil |
||||
} |
||||
|
||||
// TODO: the coverage of this index is rather spotty. Allowing users to set
|
||||
// encodings would allow:
|
||||
// - users to increase coverage
|
||||
// - allow a partially loaded set of encodings in case the user doesn't need
|
||||
// them all.
|
||||
// - write an OS-specific wrapper for supported encodings and set them.
|
||||
// The exact definition of Set depends a bit on if and how we want to let users
|
||||
// write their own Encoding implementations. Also, it is not possible yet to
|
||||
// only partially load the encodings without doing some refactoring. Until this
|
||||
// is solved, we might as well not support Set.
|
||||
// // Set sets the e to be used for the encoding scheme identified by name. Only
|
||||
// // canonical names may be used. An empty name assigns e to its internally
|
||||
// // associated encoding scheme.
|
||||
// func (x *Index) Set(name string, e encoding.Encoding) error {
|
||||
// panic("TODO: implement")
|
||||
// }
|
||||
|
||||
func findMIB(x []identifier.MIB, mib identifier.MIB) int { |
||||
i := sort.Search(len(x), func(i int) bool { return x[i] >= mib }) |
||||
if i < len(x) && x[i] == mib { |
||||
return i |
||||
} |
||||
return -1 |
||||
} |
||||
|
||||
// maxMIMENameLen is the largest value of a packed name's first byte that is
// still read as a length prefix rather than as the first character of a name.
// Officially the limit is 40, but we leave some buffer.
const maxMIMENameLen = '0' - 1
|
||||
|
||||
func mimeName(x int) string { |
||||
n := ianaNames[x] |
||||
// See gen.go for a description of the encoding.
|
||||
if n[0] <= maxMIMENameLen { |
||||
return n[1:n[0]] |
||||
} |
||||
return n |
||||
} |
||||
|
||||
func ianaName(x int) string { |
||||
n := ianaNames[x] |
||||
// See gen.go for a description of the encoding.
|
||||
if n[0] <= maxMIMENameLen { |
||||
return n[n[0]:] |
||||
} |
||||
return n |
||||
} |
||||
|
||||
func mibName(x int) string { |
||||
return mibNames[x] |
||||
} |
||||
|
||||
var encodings = [numIANA]encoding.Encoding{ |
||||
enc3: asciiEnc, |
||||
enc106: unicode.UTF8, |
||||
enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM), |
||||
enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM), |
||||
enc1014: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), |
||||
enc2028: charmap.CodePage037, |
||||
enc2011: charmap.CodePage437, |
||||
enc2009: charmap.CodePage850, |
||||
enc2010: charmap.CodePage852, |
||||
enc2046: charmap.CodePage855, |
||||
enc2089: charmap.CodePage858, |
||||
enc2048: charmap.CodePage860, |
||||
enc2013: charmap.CodePage862, |
||||
enc2050: charmap.CodePage863, |
||||
enc2052: charmap.CodePage865, |
||||
enc2086: charmap.CodePage866, |
||||
enc2102: charmap.CodePage1047, |
||||
enc2091: charmap.CodePage1140, |
||||
enc4: charmap.ISO8859_1, |
||||
enc5: charmap.ISO8859_2, |
||||
enc6: charmap.ISO8859_3, |
||||
enc7: charmap.ISO8859_4, |
||||
enc8: charmap.ISO8859_5, |
||||
enc9: charmap.ISO8859_6, |
||||
enc81: charmap.ISO8859_6E, |
||||
enc82: charmap.ISO8859_6I, |
||||
enc10: charmap.ISO8859_7, |
||||
enc11: charmap.ISO8859_8, |
||||
enc84: charmap.ISO8859_8E, |
||||
enc85: charmap.ISO8859_8I, |
||||
enc12: charmap.ISO8859_9, |
||||
enc13: charmap.ISO8859_10, |
||||
enc109: charmap.ISO8859_13, |
||||
enc110: charmap.ISO8859_14, |
||||
enc111: charmap.ISO8859_15, |
||||
enc112: charmap.ISO8859_16, |
||||
enc2084: charmap.KOI8R, |
||||
enc2088: charmap.KOI8U, |
||||
enc2027: charmap.Macintosh, |
||||
enc2109: charmap.Windows874, |
||||
enc2250: charmap.Windows1250, |
||||
enc2251: charmap.Windows1251, |
||||
enc2252: charmap.Windows1252, |
||||
enc2253: charmap.Windows1253, |
||||
enc2254: charmap.Windows1254, |
||||
enc2255: charmap.Windows1255, |
||||
enc2256: charmap.Windows1256, |
||||
enc2257: charmap.Windows1257, |
||||
enc2258: charmap.Windows1258, |
||||
enc18: japanese.EUCJP, |
||||
enc39: japanese.ISO2022JP, |
||||
enc17: japanese.ShiftJIS, |
||||
enc38: korean.EUCKR, |
||||
enc114: simplifiedchinese.GB18030, |
||||
enc113: simplifiedchinese.GBK, |
||||
enc2085: simplifiedchinese.HZGB2312, |
||||
enc2026: traditionalchinese.Big5, |
||||
} |
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,81 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run gen.go
|
||||
|
||||
// Package identifier defines the contract between implementations of Encoding
|
||||
// and Index by defining identifiers that uniquely identify standardized coded
|
||||
// character sets (CCS) and character encoding schemes (CES), which we will
|
||||
// together refer to as encodings, for which Encoding implementations provide
|
||||
// converters to and from UTF-8. This package is typically only of concern to
|
||||
// implementers of Indexes and Encodings.
|
||||
//
|
||||
// One part of the identifier is the MIB code, which is defined by IANA and
|
||||
// uniquely identifies a CCS or CES. Each code is associated with data that
|
||||
// references authorities, official documentation as well as aliases and MIME
|
||||
// names.
|
||||
//
|
||||
// Not all CESs are covered by the IANA registry. The "other" string that is
|
||||
// returned by ID can be used to identify other character sets or versions of
|
||||
// existing ones.
|
||||
//
|
||||
// It is recommended that each package that provides a set of Encodings provide
|
||||
// the All and Common variables to reference all supported encodings and
|
||||
// commonly used subset. This allows Index implementations to include all
|
||||
// available encodings without explicitly referencing or knowing about them.
|
||||
package identifier |
||||
|
||||
// Note: this package is internal, but could be made public if there is a need
|
||||
// for writing third-party Indexes and Encodings.
|
||||
|
||||
// References:
|
||||
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
|
||||
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
|
||||
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
|
||||
// - http://www.ietf.org/rfc/rfc2978.txt
|
||||
// - https://www.unicode.org/reports/tr22/
|
||||
// - http://www.w3.org/TR/encoding/
|
||||
// - https://encoding.spec.whatwg.org/
|
||||
// - https://encoding.spec.whatwg.org/encodings.json
|
||||
// - https://tools.ietf.org/html/rfc6657#section-5
|
||||
|
||||
// Interface is implemented by Encodings that want to declare the CCS or CES
// for which they provide conversions.
type Interface interface {
	// ID reports the identifier of the encoding. Of the two results, exactly
	// one should be non-zero.
	//
	// Normally only the MIB code needs to be set. The other string exists for
	// encodings that have no MIB, such as "x-mac-dingbat".
	//
	// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
	ID() (mib MIB, other string)

	// NOTE: the character restrictions leave room to extend the syntax later
	// with additional information such as versions, vendors and other variants.
}

// MIB is the numeric identifier of an encoding. The values are derived from
// the IANA MIB codes, extended with a few identifiers for encodings that the
// IANA standard does not cover.
//
// See http://www.iana.org/assignments/ianacharset-mib.
type MIB uint16

// The MIB values below are not defined by IANA. They are added because the
// encodings are common and are defined within the text repo.
const (
	// Unofficial is the first value used for encodings that are not
	// registered by IANA.
	Unofficial MIB = 10000 + iota

	// Replacement identifies the WhatWG replacement encoding.
	Replacement

	// XUserDefined identifies x-user-defined.
	XUserDefined

	// MacintoshCyrillic identifies x-mac-cyrillic.
	MacintoshCyrillic
)
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,75 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package internal contains code that is shared among encoding implementations.
|
||||
package internal |
||||
|
||||
import ( |
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// Encoding is an implementation of the Encoding interface that adds the String
|
||||
// and ID methods to an existing encoding.
|
||||
type Encoding struct { |
||||
encoding.Encoding |
||||
Name string |
||||
MIB identifier.MIB |
||||
} |
||||
|
||||
// _ verifies that Encoding implements identifier.Interface.
|
||||
var _ identifier.Interface = (*Encoding)(nil) |
||||
|
||||
func (e *Encoding) String() string { |
||||
return e.Name |
||||
} |
||||
|
||||
func (e *Encoding) ID() (mib identifier.MIB, other string) { |
||||
return e.MIB, "" |
||||
} |
||||
|
||||
// SimpleEncoding is an Encoding that combines two Transformers.
|
||||
type SimpleEncoding struct { |
||||
Decoder transform.Transformer |
||||
Encoder transform.Transformer |
||||
} |
||||
|
||||
func (e *SimpleEncoding) NewDecoder() *encoding.Decoder { |
||||
return &encoding.Decoder{Transformer: e.Decoder} |
||||
} |
||||
|
||||
func (e *SimpleEncoding) NewEncoder() *encoding.Encoder { |
||||
return &encoding.Encoder{Transformer: e.Encoder} |
||||
} |
||||
|
||||
// FuncEncoding is an Encoding that combines two functions returning a new
|
||||
// Transformer.
|
||||
type FuncEncoding struct { |
||||
Decoder func() transform.Transformer |
||||
Encoder func() transform.Transformer |
||||
} |
||||
|
||||
func (e FuncEncoding) NewDecoder() *encoding.Decoder { |
||||
return &encoding.Decoder{Transformer: e.Decoder()} |
||||
} |
||||
|
||||
func (e FuncEncoding) NewEncoder() *encoding.Encoder { |
||||
return &encoding.Encoder{Transformer: e.Encoder()} |
||||
} |
||||
|
||||
// A RepertoireError reports that a rune is not in the repertoire of a
// destination encoding. Its value is the encoding-specific replacement byte
// suggested for that rune.
type RepertoireError byte

// Error implements the error interface. The message is the same for every
// RepertoireError value.
func (r RepertoireError) Error() string {
	const msg = "encoding: rune not supported by encoding."
	return msg
}

// Replacement returns the replacement byte associated with this error.
func (r RepertoireError) Replacement() byte {
	return byte(r)
}
||||
|
||||
var ErrASCIIReplacement = RepertoireError(encoding.ASCIISub) |
||||
@ -0,0 +1,12 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese |
||||
|
||||
import ( |
||||
"golang.org/x/text/encoding" |
||||
) |
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{EUCJP, ISO2022JP, ShiftJIS} |
||||
@ -0,0 +1,225 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// EUCJP is the EUC-JP encoding.
|
||||
var EUCJP encoding.Encoding = &eucJP |
||||
|
||||
var eucJP = internal.Encoding{ |
||||
&internal.SimpleEncoding{eucJPDecoder{}, eucJPEncoder{}}, |
||||
"EUC-JP", |
||||
identifier.EUCPkdFmtJapanese, |
||||
} |
||||
|
||||
type eucJPDecoder struct{ transform.NopResetter } |
||||
|
||||
// See https://encoding.spec.whatwg.org/#euc-jp-decoder.
|
||||
func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
loop: |
||||
for ; nSrc < len(src); nSrc += size { |
||||
switch c0 := src[nSrc]; { |
||||
case c0 < utf8.RuneSelf: |
||||
r, size = rune(c0), 1 |
||||
|
||||
case c0 == 0x8e: |
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = utf8.RuneError, 1 |
||||
break |
||||
} |
||||
c1 := src[nSrc+1] |
||||
switch { |
||||
case c1 < 0xa1: |
||||
r, size = utf8.RuneError, 1 |
||||
case c1 > 0xdf: |
||||
r, size = utf8.RuneError, 2 |
||||
if c1 == 0xff { |
||||
size = 1 |
||||
} |
||||
default: |
||||
r, size = rune(c1)+(0xff61-0xa1), 2 |
||||
} |
||||
case c0 == 0x8f: |
||||
if nSrc+2 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = utf8.RuneError, 1 |
||||
if p := nSrc + 1; p < len(src) && 0xa1 <= src[p] && src[p] < 0xfe { |
||||
size = 2 |
||||
} |
||||
break |
||||
} |
||||
c1 := src[nSrc+1] |
||||
if c1 < 0xa1 || 0xfe < c1 { |
||||
r, size = utf8.RuneError, 1 |
||||
break |
||||
} |
||||
c2 := src[nSrc+2] |
||||
if c2 < 0xa1 || 0xfe < c2 { |
||||
r, size = utf8.RuneError, 2 |
||||
break |
||||
} |
||||
r, size = utf8.RuneError, 3 |
||||
if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) { |
||||
r = rune(jis0212Decode[i]) |
||||
if r == 0 { |
||||
r = utf8.RuneError |
||||
} |
||||
} |
||||
|
||||
case 0xa1 <= c0 && c0 <= 0xfe: |
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = utf8.RuneError, 1 |
||||
break |
||||
} |
||||
c1 := src[nSrc+1] |
||||
if c1 < 0xa1 || 0xfe < c1 { |
||||
r, size = utf8.RuneError, 1 |
||||
break |
||||
} |
||||
r, size = utf8.RuneError, 2 |
||||
if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) { |
||||
r = rune(jis0208Decode[i]) |
||||
if r == 0 { |
||||
r = utf8.RuneError |
||||
} |
||||
} |
||||
|
||||
default: |
||||
r, size = utf8.RuneError, 1 |
||||
} |
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break loop |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type eucJPEncoder struct{ transform.NopResetter } |
||||
|
||||
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
for ; nSrc < len(src); nSrc += size { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch { |
||||
case encode0Low <= r && r < encode0High: |
||||
if r = rune(encode0[r-encode0Low]); r != 0 { |
||||
goto write2or3 |
||||
} |
||||
case encode1Low <= r && r < encode1High: |
||||
if r = rune(encode1[r-encode1Low]); r != 0 { |
||||
goto write2or3 |
||||
} |
||||
case encode2Low <= r && r < encode2High: |
||||
if r = rune(encode2[r-encode2Low]); r != 0 { |
||||
goto write2or3 |
||||
} |
||||
case encode3Low <= r && r < encode3High: |
||||
if r = rune(encode3[r-encode3Low]); r != 0 { |
||||
goto write2or3 |
||||
} |
||||
case encode4Low <= r && r < encode4High: |
||||
if r = rune(encode4[r-encode4Low]); r != 0 { |
||||
goto write2or3 |
||||
} |
||||
case encode5Low <= r && r < encode5High: |
||||
if 0xff61 <= r && r < 0xffa0 { |
||||
goto write2 |
||||
} |
||||
if r = rune(encode5[r-encode5Low]); r != 0 { |
||||
goto write2or3 |
||||
} |
||||
} |
||||
err = internal.ErrASCIIReplacement |
||||
break |
||||
} |
||||
|
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = uint8(r) |
||||
nDst++ |
||||
continue |
||||
|
||||
write2or3: |
||||
if r>>tableShift == jis0208 { |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
} else { |
||||
if nDst+3 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = 0x8f |
||||
nDst++ |
||||
} |
||||
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask |
||||
dst[nDst+1] = 0xa1 + uint8(r)&codeMask |
||||
nDst += 2 |
||||
continue |
||||
|
||||
write2: |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = 0x8e |
||||
dst[nDst+1] = uint8(r - (0xff61 - 0xa1)) |
||||
nDst += 2 |
||||
continue |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
func init() { |
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 6 { |
||||
panic("bad numEncodeTables") |
||||
} |
||||
} |
||||
@ -0,0 +1,299 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// ISO2022JP is the ISO-2022-JP encoding.
|
||||
var ISO2022JP encoding.Encoding = &iso2022JP |
||||
|
||||
var iso2022JP = internal.Encoding{ |
||||
internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder}, |
||||
"ISO-2022-JP", |
||||
identifier.ISO2022JP, |
||||
} |
||||
|
||||
func iso2022JPNewDecoder() transform.Transformer { |
||||
return new(iso2022JPDecoder) |
||||
} |
||||
|
||||
func iso2022JPNewEncoder() transform.Transformer { |
||||
return new(iso2022JPEncoder) |
||||
} |
||||
|
||||
// States shared by the ISO-2022-JP decoder and encoder. Escape sequences move
// a coder from one state to another.
const (
	asciiState = iota
	katakanaState
	jis0208State
	jis0212State
)

// asciiEsc is the byte that begins every escape sequence.
const asciiEsc = 0x1b

// iso2022JPDecoder holds the decoder's current state, one of the *State
// constants.
type iso2022JPDecoder int

// Reset puts the decoder back into asciiState.
func (d *iso2022JPDecoder) Reset() {
	*d = iso2022JPDecoder(asciiState)
}
||||
|
||||
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
for ; nSrc < len(src); nSrc += size { |
||||
c0 := src[nSrc] |
||||
if c0 >= utf8.RuneSelf { |
||||
r, size = '\ufffd', 1 |
||||
goto write |
||||
} |
||||
|
||||
if c0 == asciiEsc { |
||||
if nSrc+2 >= len(src) { |
||||
if !atEOF { |
||||
return nDst, nSrc, transform.ErrShortSrc |
||||
} |
||||
// TODO: is it correct to only skip 1??
|
||||
r, size = '\ufffd', 1 |
||||
goto write |
||||
} |
||||
size = 3 |
||||
c1 := src[nSrc+1] |
||||
c2 := src[nSrc+2] |
||||
switch { |
||||
case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
|
||||
*d = jis0208State |
||||
continue |
||||
case c1 == '$' && c2 == '(': // 0x24 0x28
|
||||
if nSrc+3 >= len(src) { |
||||
if !atEOF { |
||||
return nDst, nSrc, transform.ErrShortSrc |
||||
} |
||||
r, size = '\ufffd', 1 |
||||
goto write |
||||
} |
||||
size = 4 |
||||
if src[nSrc+3] == 'D' { |
||||
*d = jis0212State |
||||
continue |
||||
} |
||||
case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
|
||||
*d = asciiState |
||||
continue |
||||
case c1 == '(' && c2 == 'I': // 0x28 0x49
|
||||
*d = katakanaState |
||||
continue |
||||
} |
||||
r, size = '\ufffd', 1 |
||||
goto write |
||||
} |
||||
|
||||
switch *d { |
||||
case asciiState: |
||||
r, size = rune(c0), 1 |
||||
|
||||
case katakanaState: |
||||
if c0 < 0x21 || 0x60 <= c0 { |
||||
r, size = '\ufffd', 1 |
||||
goto write |
||||
} |
||||
r, size = rune(c0)+(0xff61-0x21), 1 |
||||
|
||||
default: |
||||
if c0 == 0x0a { |
||||
*d = asciiState |
||||
r, size = rune(c0), 1 |
||||
goto write |
||||
} |
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
return nDst, nSrc, transform.ErrShortSrc |
||||
} |
||||
r, size = '\ufffd', 1 |
||||
goto write |
||||
} |
||||
size = 2 |
||||
c1 := src[nSrc+1] |
||||
i := int(c0-0x21)*94 + int(c1-0x21) |
||||
if *d == jis0208State && i < len(jis0208Decode) { |
||||
r = rune(jis0208Decode[i]) |
||||
} else if *d == jis0212State && i < len(jis0212Decode) { |
||||
r = rune(jis0212Decode[i]) |
||||
} else { |
||||
r = '\ufffd' |
||||
goto write |
||||
} |
||||
if r == 0 { |
||||
r = '\ufffd' |
||||
} |
||||
} |
||||
|
||||
write: |
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
return nDst, nSrc, transform.ErrShortDst |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type iso2022JPEncoder int |
||||
|
||||
func (e *iso2022JPEncoder) Reset() { |
||||
*e = asciiState |
||||
} |
||||
|
||||
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
for ; nSrc < len(src); nSrc += size { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
//
|
||||
// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
|
||||
// is not used by the iso-2022-jp encoder due to lack of widespread support".
|
||||
//
|
||||
// TODO: do we have to special-case U+00A5 and U+203E, as per
|
||||
// http://encoding.spec.whatwg.org/#iso-2022-jp
|
||||
// Doing so would mean that "\u00a5" would not be preserved
|
||||
// after an encode-decode round trip.
|
||||
switch { |
||||
case encode0Low <= r && r < encode0High: |
||||
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 { |
||||
goto writeJIS |
||||
} |
||||
case encode1Low <= r && r < encode1High: |
||||
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 { |
||||
goto writeJIS |
||||
} |
||||
case encode2Low <= r && r < encode2High: |
||||
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 { |
||||
goto writeJIS |
||||
} |
||||
case encode3Low <= r && r < encode3High: |
||||
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 { |
||||
goto writeJIS |
||||
} |
||||
case encode4Low <= r && r < encode4High: |
||||
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 { |
||||
goto writeJIS |
||||
} |
||||
case encode5Low <= r && r < encode5High: |
||||
if 0xff61 <= r && r < 0xffa0 { |
||||
goto writeKatakana |
||||
} |
||||
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 { |
||||
goto writeJIS |
||||
} |
||||
} |
||||
|
||||
// Switch back to ASCII state in case of error so that an ASCII
|
||||
// replacement character can be written in the correct state.
|
||||
if *e != asciiState { |
||||
if nDst+3 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
*e = asciiState |
||||
dst[nDst+0] = asciiEsc |
||||
dst[nDst+1] = '(' |
||||
dst[nDst+2] = 'B' |
||||
nDst += 3 |
||||
} |
||||
err = internal.ErrASCIIReplacement |
||||
break |
||||
} |
||||
|
||||
if *e != asciiState { |
||||
if nDst+4 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
*e = asciiState |
||||
dst[nDst+0] = asciiEsc |
||||
dst[nDst+1] = '(' |
||||
dst[nDst+2] = 'B' |
||||
nDst += 3 |
||||
} else if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = uint8(r) |
||||
nDst++ |
||||
continue |
||||
|
||||
writeJIS: |
||||
if *e != jis0208State { |
||||
if nDst+5 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
*e = jis0208State |
||||
dst[nDst+0] = asciiEsc |
||||
dst[nDst+1] = '$' |
||||
dst[nDst+2] = 'B' |
||||
nDst += 3 |
||||
} else if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask |
||||
dst[nDst+1] = 0x21 + uint8(r)&codeMask |
||||
nDst += 2 |
||||
continue |
||||
|
||||
writeKatakana: |
||||
if *e != katakanaState { |
||||
if nDst+4 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
*e = katakanaState |
||||
dst[nDst+0] = asciiEsc |
||||
dst[nDst+1] = '(' |
||||
dst[nDst+2] = 'I' |
||||
nDst += 3 |
||||
} else if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = uint8(r - (0xff61 - 0x21)) |
||||
nDst++ |
||||
continue |
||||
} |
||||
if atEOF && err == nil && *e != asciiState { |
||||
if nDst+3 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
} else { |
||||
*e = asciiState |
||||
dst[nDst+0] = asciiEsc |
||||
dst[nDst+1] = '(' |
||||
dst[nDst+2] = 'B' |
||||
nDst += 3 |
||||
} |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
@ -0,0 +1,189 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package japanese |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
|
||||
// Windows-31J.
|
||||
var ShiftJIS encoding.Encoding = &shiftJIS |
||||
|
||||
var shiftJIS = internal.Encoding{ |
||||
&internal.SimpleEncoding{shiftJISDecoder{}, shiftJISEncoder{}}, |
||||
"Shift JIS", |
||||
identifier.ShiftJIS, |
||||
} |
||||
|
||||
type shiftJISDecoder struct{ transform.NopResetter } |
||||
|
||||
func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
loop: |
||||
for ; nSrc < len(src); nSrc += size { |
||||
switch c0 := src[nSrc]; { |
||||
case c0 < utf8.RuneSelf: |
||||
r, size = rune(c0), 1 |
||||
|
||||
case 0xa1 <= c0 && c0 < 0xe0: |
||||
r, size = rune(c0)+(0xff61-0xa1), 1 |
||||
|
||||
case (0x81 <= c0 && c0 < 0xa0) || (0xe0 <= c0 && c0 < 0xfd): |
||||
if c0 <= 0x9f { |
||||
c0 -= 0x70 |
||||
} else { |
||||
c0 -= 0xb0 |
||||
} |
||||
c0 = 2*c0 - 0x21 |
||||
|
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = '\ufffd', 1 |
||||
goto write |
||||
} |
||||
c1 := src[nSrc+1] |
||||
switch { |
||||
case c1 < 0x40: |
||||
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
|
||||
goto write |
||||
case c1 < 0x7f: |
||||
c0-- |
||||
c1 -= 0x40 |
||||
case c1 == 0x7f: |
||||
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
|
||||
goto write |
||||
case c1 < 0x9f: |
||||
c0-- |
||||
c1 -= 0x41 |
||||
case c1 < 0xfd: |
||||
c1 -= 0x9f |
||||
default: |
||||
r, size = '\ufffd', 2 |
||||
goto write |
||||
} |
||||
r, size = '\ufffd', 2 |
||||
if i := int(c0)*94 + int(c1); i < len(jis0208Decode) { |
||||
r = rune(jis0208Decode[i]) |
||||
if r == 0 { |
||||
r = '\ufffd' |
||||
} |
||||
} |
||||
|
||||
case c0 == 0x80: |
||||
r, size = 0x80, 1 |
||||
|
||||
default: |
||||
r, size = '\ufffd', 1 |
||||
} |
||||
write: |
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break loop |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type shiftJISEncoder struct{ transform.NopResetter } |
||||
|
||||
func (shiftJISEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
loop: |
||||
for ; nSrc < len(src); nSrc += size { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
} |
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch { |
||||
case encode0Low <= r && r < encode0High: |
||||
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 { |
||||
goto write2 |
||||
} |
||||
case encode1Low <= r && r < encode1High: |
||||
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 { |
||||
goto write2 |
||||
} |
||||
case encode2Low <= r && r < encode2High: |
||||
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 { |
||||
goto write2 |
||||
} |
||||
case encode3Low <= r && r < encode3High: |
||||
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 { |
||||
goto write2 |
||||
} |
||||
case encode4Low <= r && r < encode4High: |
||||
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 { |
||||
goto write2 |
||||
} |
||||
case encode5Low <= r && r < encode5High: |
||||
if 0xff61 <= r && r < 0xffa0 { |
||||
r -= 0xff61 - 0xa1 |
||||
goto write1 |
||||
} |
||||
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 { |
||||
goto write2 |
||||
} |
||||
} |
||||
err = internal.ErrASCIIReplacement |
||||
break |
||||
} |
||||
|
||||
write1: |
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = uint8(r) |
||||
nDst++ |
||||
continue |
||||
|
||||
write2: |
||||
j1 := uint8(r>>codeShift) & codeMask |
||||
j2 := uint8(r) & codeMask |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break loop |
||||
} |
||||
if j1 <= 61 { |
||||
dst[nDst+0] = 129 + j1/2 |
||||
} else { |
||||
dst[nDst+0] = 193 + j1/2 |
||||
} |
||||
if j1&1 == 0 { |
||||
dst[nDst+1] = j2 + j2/63 + 64 |
||||
} else { |
||||
dst[nDst+1] = j2 + 159 |
||||
} |
||||
nDst += 2 |
||||
continue |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,177 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package korean |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{EUCKR} |
||||
|
||||
// EUCKR is the EUC-KR encoding, also known as Code Page 949.
|
||||
var EUCKR encoding.Encoding = &eucKR |
||||
|
||||
var eucKR = internal.Encoding{ |
||||
&internal.SimpleEncoding{eucKRDecoder{}, eucKREncoder{}}, |
||||
"EUC-KR", |
||||
identifier.EUCKR, |
||||
} |
||||
|
||||
type eucKRDecoder struct{ transform.NopResetter } |
||||
|
||||
func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
loop: |
||||
for ; nSrc < len(src); nSrc += size { |
||||
switch c0 := src[nSrc]; { |
||||
case c0 < utf8.RuneSelf: |
||||
r, size = rune(c0), 1 |
||||
|
||||
case 0x81 <= c0 && c0 < 0xff: |
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = utf8.RuneError, 1 |
||||
break |
||||
} |
||||
c1 := src[nSrc+1] |
||||
size = 2 |
||||
if c0 < 0xc7 { |
||||
r = 178 * rune(c0-0x81) |
||||
switch { |
||||
case 0x41 <= c1 && c1 < 0x5b: |
||||
r += rune(c1) - (0x41 - 0*26) |
||||
case 0x61 <= c1 && c1 < 0x7b: |
||||
r += rune(c1) - (0x61 - 1*26) |
||||
case 0x81 <= c1 && c1 < 0xff: |
||||
r += rune(c1) - (0x81 - 2*26) |
||||
default: |
||||
goto decError |
||||
} |
||||
} else if 0xa1 <= c1 && c1 < 0xff { |
||||
r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1) |
||||
} else { |
||||
goto decError |
||||
} |
||||
if int(r) < len(decode) { |
||||
r = rune(decode[r]) |
||||
if r != 0 { |
||||
break |
||||
} |
||||
} |
||||
decError: |
||||
r = utf8.RuneError |
||||
if c1 < utf8.RuneSelf { |
||||
size = 1 |
||||
} |
||||
|
||||
default: |
||||
r, size = utf8.RuneError, 1 |
||||
break |
||||
} |
||||
|
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type eucKREncoder struct{ transform.NopResetter } |
||||
|
||||
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
for ; nSrc < len(src); nSrc += size { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
|
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = uint8(r) |
||||
nDst++ |
||||
continue |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch { |
||||
case encode0Low <= r && r < encode0High: |
||||
if r = rune(encode0[r-encode0Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode1Low <= r && r < encode1High: |
||||
if r = rune(encode1[r-encode1Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode2Low <= r && r < encode2High: |
||||
if r = rune(encode2[r-encode2Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode3Low <= r && r < encode3High: |
||||
if r = rune(encode3[r-encode3Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode4Low <= r && r < encode4High: |
||||
if r = rune(encode4[r-encode4Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode5Low <= r && r < encode5High: |
||||
if r = rune(encode5[r-encode5Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode6Low <= r && r < encode6High: |
||||
if r = rune(encode6[r-encode6Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
} |
||||
err = internal.ErrASCIIReplacement |
||||
break |
||||
} |
||||
|
||||
write2: |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = uint8(r >> 8) |
||||
dst[nDst+1] = uint8(r) |
||||
nDst += 2 |
||||
continue |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
func init() { |
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 7 { |
||||
panic("bad numEncodeTables") |
||||
} |
||||
} |
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,12 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese |
||||
|
||||
import ( |
||||
"golang.org/x/text/encoding" |
||||
) |
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{GB18030, GBK, HZGB2312} |
||||
@ -0,0 +1,269 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
var ( |
||||
// GB18030 is the GB18030 encoding.
|
||||
GB18030 encoding.Encoding = &gbk18030 |
||||
// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
|
||||
// and is also known as Code Page 936.
|
||||
GBK encoding.Encoding = &gbk |
||||
) |
||||
|
||||
var gbk = internal.Encoding{ |
||||
&internal.SimpleEncoding{ |
||||
gbkDecoder{gb18030: false}, |
||||
gbkEncoder{gb18030: false}, |
||||
}, |
||||
"GBK", |
||||
identifier.GBK, |
||||
} |
||||
|
||||
var gbk18030 = internal.Encoding{ |
||||
&internal.SimpleEncoding{ |
||||
gbkDecoder{gb18030: true}, |
||||
gbkEncoder{gb18030: true}, |
||||
}, |
||||
"GB18030", |
||||
identifier.GB18030, |
||||
} |
||||
|
||||
type gbkDecoder struct { |
||||
transform.NopResetter |
||||
gb18030 bool |
||||
} |
||||
|
||||
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
loop: |
||||
for ; nSrc < len(src); nSrc += size { |
||||
switch c0 := src[nSrc]; { |
||||
case c0 < utf8.RuneSelf: |
||||
r, size = rune(c0), 1 |
||||
|
||||
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||
// says to treat "gbk" as Code Page 936.
|
||||
case c0 == 0x80: |
||||
r, size = '€', 1 |
||||
|
||||
case c0 < 0xff: |
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
c1 := src[nSrc+1] |
||||
switch { |
||||
case 0x40 <= c1 && c1 < 0x7f: |
||||
c1 -= 0x40 |
||||
case 0x80 <= c1 && c1 < 0xff: |
||||
c1 -= 0x41 |
||||
case d.gb18030 && 0x30 <= c1 && c1 < 0x40: |
||||
if nSrc+3 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
// The second byte here is always ASCII, so we can set size
|
||||
// to 1 in all cases.
|
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
c2 := src[nSrc+2] |
||||
if c2 < 0x81 || 0xff <= c2 { |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
c3 := src[nSrc+3] |
||||
if c3 < 0x30 || 0x3a <= c3 { |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
size = 4 |
||||
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30) |
||||
if r < 39420 { |
||||
i, j := 0, len(gb18030) |
||||
for i < j { |
||||
h := i + (j-i)/2 |
||||
if r >= rune(gb18030[h][0]) { |
||||
i = h + 1 |
||||
} else { |
||||
j = h |
||||
} |
||||
} |
||||
dec := &gb18030[i-1] |
||||
r += rune(dec[1]) - rune(dec[0]) |
||||
goto write |
||||
} |
||||
r -= 189000 |
||||
if 0 <= r && r < 0x100000 { |
||||
r += 0x10000 |
||||
} else { |
||||
r, size = utf8.RuneError, 1 |
||||
} |
||||
goto write |
||||
default: |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
r, size = '\ufffd', 2 |
||||
if i := int(c0-0x81)*190 + int(c1); i < len(decode) { |
||||
r = rune(decode[i]) |
||||
if r == 0 { |
||||
r = '\ufffd' |
||||
} |
||||
} |
||||
|
||||
default: |
||||
r, size = utf8.RuneError, 1 |
||||
} |
||||
|
||||
write: |
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break loop |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type gbkEncoder struct { |
||||
transform.NopResetter |
||||
gb18030 bool |
||||
} |
||||
|
||||
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, r2, size := rune(0), rune(0), 0 |
||||
for ; nSrc < len(src); nSrc += size { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch { |
||||
case encode0Low <= r && r < encode0High: |
||||
if r2 = rune(encode0[r-encode0Low]); r2 != 0 { |
||||
goto write2 |
||||
} |
||||
case encode1Low <= r && r < encode1High: |
||||
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
|
||||
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
|
||||
// says to treat "gbk" as Code Page 936.
|
||||
if r == '€' { |
||||
r = 0x80 |
||||
goto write1 |
||||
} |
||||
if r2 = rune(encode1[r-encode1Low]); r2 != 0 { |
||||
goto write2 |
||||
} |
||||
case encode2Low <= r && r < encode2High: |
||||
if r2 = rune(encode2[r-encode2Low]); r2 != 0 { |
||||
goto write2 |
||||
} |
||||
case encode3Low <= r && r < encode3High: |
||||
if r2 = rune(encode3[r-encode3Low]); r2 != 0 { |
||||
goto write2 |
||||
} |
||||
case encode4Low <= r && r < encode4High: |
||||
if r2 = rune(encode4[r-encode4Low]); r2 != 0 { |
||||
goto write2 |
||||
} |
||||
} |
||||
|
||||
if e.gb18030 { |
||||
if r < 0x10000 { |
||||
i, j := 0, len(gb18030) |
||||
for i < j { |
||||
h := i + (j-i)/2 |
||||
if r >= rune(gb18030[h][1]) { |
||||
i = h + 1 |
||||
} else { |
||||
j = h |
||||
} |
||||
} |
||||
dec := &gb18030[i-1] |
||||
r += rune(dec[0]) - rune(dec[1]) |
||||
goto write4 |
||||
} else if r < 0x110000 { |
||||
r += 189000 - 0x10000 |
||||
goto write4 |
||||
} |
||||
} |
||||
err = internal.ErrASCIIReplacement |
||||
break |
||||
} |
||||
|
||||
write1: |
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = uint8(r) |
||||
nDst++ |
||||
continue |
||||
|
||||
write2: |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = uint8(r2 >> 8) |
||||
dst[nDst+1] = uint8(r2) |
||||
nDst += 2 |
||||
continue |
||||
|
||||
write4: |
||||
if nDst+4 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+3] = uint8(r%10 + 0x30) |
||||
r /= 10 |
||||
dst[nDst+2] = uint8(r%126 + 0x81) |
||||
r /= 126 |
||||
dst[nDst+1] = uint8(r%10 + 0x30) |
||||
r /= 10 |
||||
dst[nDst+0] = uint8(r + 0x81) |
||||
nDst += 4 |
||||
continue |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
func init() { |
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 5 { |
||||
panic("bad numEncodeTables") |
||||
} |
||||
} |
||||
@ -0,0 +1,245 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package simplifiedchinese |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// HZGB2312 is the HZ-GB2312 encoding.
|
||||
var HZGB2312 encoding.Encoding = &hzGB2312 |
||||
|
||||
var hzGB2312 = internal.Encoding{ |
||||
internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder}, |
||||
"HZ-GB2312", |
||||
identifier.HZGB2312, |
||||
} |
||||
|
||||
func hzGB2312NewDecoder() transform.Transformer { |
||||
return new(hzGB2312Decoder) |
||||
} |
||||
|
||||
func hzGB2312NewEncoder() transform.Transformer { |
||||
return new(hzGB2312Encoder) |
||||
} |
||||
|
||||
// Shift states used by the HZ-GB2312 decoder and encoder.
const (
	// asciiState is the zero value: bytes are handled one at a time.
	asciiState = iota
	// gbState is entered by the "~{" escape: bytes are handled in pairs.
	gbState
)
||||
|
||||
type hzGB2312Decoder int |
||||
|
||||
func (d *hzGB2312Decoder) Reset() { |
||||
*d = asciiState |
||||
} |
||||
|
||||
func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
loop: |
||||
for ; nSrc < len(src); nSrc += size { |
||||
c0 := src[nSrc] |
||||
if c0 >= utf8.RuneSelf { |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
|
||||
if c0 == '~' { |
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
size = 2 |
||||
switch src[nSrc+1] { |
||||
case '{': |
||||
*d = gbState |
||||
continue |
||||
case '}': |
||||
*d = asciiState |
||||
continue |
||||
case '~': |
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break loop |
||||
} |
||||
dst[nDst] = '~' |
||||
nDst++ |
||||
continue |
||||
case '\n': |
||||
continue |
||||
default: |
||||
r = utf8.RuneError |
||||
goto write |
||||
} |
||||
} |
||||
|
||||
if *d == asciiState { |
||||
r, size = rune(c0), 1 |
||||
} else { |
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
size = 2 |
||||
c1 := src[nSrc+1] |
||||
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 { |
||||
// error
|
||||
} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) { |
||||
r = rune(decode[i]) |
||||
if r != 0 { |
||||
goto write |
||||
} |
||||
} |
||||
if c1 > utf8.RuneSelf { |
||||
// Be consistent and always treat non-ASCII as a single error.
|
||||
size = 1 |
||||
} |
||||
r = utf8.RuneError |
||||
} |
||||
|
||||
write: |
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break loop |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type hzGB2312Encoder int |
||||
|
||||
func (d *hzGB2312Encoder) Reset() { |
||||
*d = asciiState |
||||
} |
||||
|
||||
func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
for ; nSrc < len(src); nSrc += size { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
if r == '~' { |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = '~' |
||||
dst[nDst+1] = '~' |
||||
nDst += 2 |
||||
continue |
||||
} else if *e != asciiState { |
||||
if nDst+3 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
*e = asciiState |
||||
dst[nDst+0] = '~' |
||||
dst[nDst+1] = '}' |
||||
nDst += 2 |
||||
} else if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = uint8(r) |
||||
nDst += 1 |
||||
continue |
||||
|
||||
} |
||||
|
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
|
||||
// func init checks that the switch covers all tables.
|
||||
switch { |
||||
case encode0Low <= r && r < encode0High: |
||||
if r = rune(encode0[r-encode0Low]); r != 0 { |
||||
goto writeGB |
||||
} |
||||
case encode1Low <= r && r < encode1High: |
||||
if r = rune(encode1[r-encode1Low]); r != 0 { |
||||
goto writeGB |
||||
} |
||||
case encode2Low <= r && r < encode2High: |
||||
if r = rune(encode2[r-encode2Low]); r != 0 { |
||||
goto writeGB |
||||
} |
||||
case encode3Low <= r && r < encode3High: |
||||
if r = rune(encode3[r-encode3Low]); r != 0 { |
||||
goto writeGB |
||||
} |
||||
case encode4Low <= r && r < encode4High: |
||||
if r = rune(encode4[r-encode4Low]); r != 0 { |
||||
goto writeGB |
||||
} |
||||
} |
||||
|
||||
terminateInASCIIState: |
||||
// Switch back to ASCII state in case of error so that an ASCII
|
||||
// replacement character can be written in the correct state.
|
||||
if *e != asciiState { |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = '~' |
||||
dst[nDst+1] = '}' |
||||
nDst += 2 |
||||
} |
||||
err = internal.ErrASCIIReplacement |
||||
break |
||||
|
||||
writeGB: |
||||
c0 := uint8(r>>8) - 0x80 |
||||
c1 := uint8(r) - 0x80 |
||||
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 { |
||||
goto terminateInASCIIState |
||||
} |
||||
if *e == asciiState { |
||||
if nDst+4 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
*e = gbState |
||||
dst[nDst+0] = '~' |
||||
dst[nDst+1] = '{' |
||||
nDst += 2 |
||||
} else if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = c0 |
||||
dst[nDst+1] = c1 |
||||
nDst += 2 |
||||
continue |
||||
} |
||||
// TODO: should one always terminate in ASCII state to make it safe to
|
||||
// concatenate two HZ-GB2312-encoded strings?
|
||||
return nDst, nSrc, err |
||||
} |
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,199 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package traditionalchinese |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// All is a list of all defined encodings in this package.
|
||||
var All = []encoding.Encoding{Big5} |
||||
|
||||
// Big5 is the Big5 encoding, also known as Code Page 950.
|
||||
var Big5 encoding.Encoding = &big5 |
||||
|
||||
var big5 = internal.Encoding{ |
||||
&internal.SimpleEncoding{big5Decoder{}, big5Encoder{}}, |
||||
"Big5", |
||||
identifier.Big5, |
||||
} |
||||
|
||||
type big5Decoder struct{ transform.NopResetter } |
||||
|
||||
func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size, s := rune(0), 0, "" |
||||
loop: |
||||
for ; nSrc < len(src); nSrc += size { |
||||
switch c0 := src[nSrc]; { |
||||
case c0 < utf8.RuneSelf: |
||||
r, size = rune(c0), 1 |
||||
|
||||
case 0x81 <= c0 && c0 < 0xff: |
||||
if nSrc+1 >= len(src) { |
||||
if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break loop |
||||
} |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
} |
||||
c1 := src[nSrc+1] |
||||
switch { |
||||
case 0x40 <= c1 && c1 < 0x7f: |
||||
c1 -= 0x40 |
||||
case 0xa1 <= c1 && c1 < 0xff: |
||||
c1 -= 0x62 |
||||
case c1 < 0x40: |
||||
r, size = utf8.RuneError, 1 |
||||
goto write |
||||
default: |
||||
r, size = utf8.RuneError, 2 |
||||
goto write |
||||
} |
||||
r, size = '\ufffd', 2 |
||||
if i := int(c0-0x81)*157 + int(c1); i < len(decode) { |
||||
if 1133 <= i && i < 1167 { |
||||
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
|
||||
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
|
||||
switch i { |
||||
case 1133: |
||||
s = "\u00CA\u0304" |
||||
goto writeStr |
||||
case 1135: |
||||
s = "\u00CA\u030C" |
||||
goto writeStr |
||||
case 1164: |
||||
s = "\u00EA\u0304" |
||||
goto writeStr |
||||
case 1166: |
||||
s = "\u00EA\u030C" |
||||
goto writeStr |
||||
} |
||||
} |
||||
r = rune(decode[i]) |
||||
if r == 0 { |
||||
r = '\ufffd' |
||||
} |
||||
} |
||||
|
||||
default: |
||||
r, size = utf8.RuneError, 1 |
||||
} |
||||
|
||||
write: |
||||
if nDst+utf8.RuneLen(r) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break loop |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
continue loop |
||||
|
||||
writeStr: |
||||
if nDst+len(s) > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break loop |
||||
} |
||||
nDst += copy(dst[nDst:], s) |
||||
continue loop |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type big5Encoder struct{ transform.NopResetter } |
||||
|
||||
func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
r, size := rune(0), 0 |
||||
for ; nSrc < len(src); nSrc += size { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
if nDst >= len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = uint8(r) |
||||
nDst++ |
||||
continue |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
} |
||||
|
||||
if r >= utf8.RuneSelf { |
||||
// func init checks that the switch covers all tables.
|
||||
switch { |
||||
case encode0Low <= r && r < encode0High: |
||||
if r = rune(encode0[r-encode0Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode1Low <= r && r < encode1High: |
||||
if r = rune(encode1[r-encode1Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode2Low <= r && r < encode2High: |
||||
if r = rune(encode2[r-encode2Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode3Low <= r && r < encode3High: |
||||
if r = rune(encode3[r-encode3Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode4Low <= r && r < encode4High: |
||||
if r = rune(encode4[r-encode4Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode5Low <= r && r < encode5High: |
||||
if r = rune(encode5[r-encode5Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode6Low <= r && r < encode6High: |
||||
if r = rune(encode6[r-encode6Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
case encode7Low <= r && r < encode7High: |
||||
if r = rune(encode7[r-encode7Low]); r != 0 { |
||||
goto write2 |
||||
} |
||||
} |
||||
err = internal.ErrASCIIReplacement |
||||
break |
||||
} |
||||
|
||||
write2: |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = uint8(r >> 8) |
||||
dst[nDst+1] = uint8(r) |
||||
nDst += 2 |
||||
continue |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
func init() { |
||||
// Check that the hard-coded encode switch covers all tables.
|
||||
if numEncodeTables != 8 { |
||||
panic("bad numEncodeTables") |
||||
} |
||||
} |
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,82 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package unicode |
||||
|
||||
import ( |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// BOMOverride returns a new decoder transformer that is identical to fallback,
|
||||
// except that the presence of a Byte Order Mark at the start of the input
|
||||
// causes it to switch to the corresponding Unicode decoding. It will only
|
||||
// consider BOMs for UTF-8, UTF-16BE, and UTF-16LE.
|
||||
//
|
||||
// This differs from using ExpectBOM by allowing a BOM to switch to UTF-8, not
|
||||
// just UTF-16 variants, and allowing falling back to any encoding scheme.
|
||||
//
|
||||
// This technique is recommended by the W3C for use in HTML 5: "For
|
||||
// compatibility with deployed content, the byte order mark (also known as BOM)
|
||||
// is considered more authoritative than anything else."
|
||||
// http://www.w3.org/TR/encoding/#specification-hooks
|
||||
//
|
||||
// Using BOMOverride is mostly intended for use cases where the first characters
|
||||
// of a fallback encoding are known to not be a BOM, for example, for valid HTML
|
||||
// and most encodings.
|
||||
func BOMOverride(fallback transform.Transformer) transform.Transformer { |
||||
// TODO: possibly allow a variadic argument of unicode encodings to allow
|
||||
// specifying details of which fallbacks are supported as well as
|
||||
// specifying the details of the implementations. This would also allow for
|
||||
// support for UTF-32, which should not be supported by default.
|
||||
return &bomOverride{fallback: fallback} |
||||
} |
||||
|
||||
type bomOverride struct { |
||||
fallback transform.Transformer |
||||
current transform.Transformer |
||||
} |
||||
|
||||
func (d *bomOverride) Reset() { |
||||
d.current = nil |
||||
d.fallback.Reset() |
||||
} |
||||
|
||||
var ( |
||||
// TODO: we could use decode functions here, instead of allocating a new
|
||||
// decoder on every NewDecoder as IgnoreBOM decoders can be stateless.
|
||||
utf16le = UTF16(LittleEndian, IgnoreBOM) |
||||
utf16be = UTF16(BigEndian, IgnoreBOM) |
||||
) |
||||
|
||||
// utf8BOM is U+FEFF as a string constant, i.e. the three bytes EF BB BF.
const utf8BOM = "\ufeff"
||||
|
||||
func (d *bomOverride) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
if d.current != nil { |
||||
return d.current.Transform(dst, src, atEOF) |
||||
} |
||||
if len(src) < 3 && !atEOF { |
||||
return 0, 0, transform.ErrShortSrc |
||||
} |
||||
d.current = d.fallback |
||||
bomSize := 0 |
||||
if len(src) >= 2 { |
||||
if src[0] == 0xFF && src[1] == 0xFE { |
||||
d.current = utf16le.NewDecoder() |
||||
bomSize = 2 |
||||
} else if src[0] == 0xFE && src[1] == 0xFF { |
||||
d.current = utf16be.NewDecoder() |
||||
bomSize = 2 |
||||
} else if len(src) >= 3 && |
||||
src[0] == utf8BOM[0] && |
||||
src[1] == utf8BOM[1] && |
||||
src[2] == utf8BOM[2] { |
||||
d.current = transform.Nop |
||||
bomSize = 3 |
||||
} |
||||
} |
||||
if bomSize < len(src) { |
||||
nDst, nSrc, err = d.current.Transform(dst, src[bomSize:], atEOF) |
||||
} |
||||
return nDst, nSrc + bomSize, err |
||||
} |
||||
@ -0,0 +1,512 @@ |
||||
// Copyright 2013 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package unicode provides Unicode encodings such as UTF-16.
|
||||
package unicode // import "golang.org/x/text/encoding/unicode"
|
||||
|
||||
import ( |
||||
"bytes" |
||||
"errors" |
||||
"unicode/utf16" |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/encoding" |
||||
"golang.org/x/text/encoding/internal" |
||||
"golang.org/x/text/encoding/internal/identifier" |
||||
"golang.org/x/text/internal/utf8internal" |
||||
"golang.org/x/text/runes" |
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// TODO: I think the Transformers really should return errors on unmatched
|
||||
// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
|
||||
// which leaves it open, but is suggested by WhatWG. It will allow for all error
|
||||
// modes as defined by WhatWG: fatal, HTML and Replacement. This would require
|
||||
// the introduction of some kind of error type for conveying the erroneous code
|
||||
// point.
|
||||
|
||||
// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.
|
||||
var UTF8 encoding.Encoding = utf8enc |
||||
|
||||
// UTF8BOM is an UTF-8 encoding where the decoder strips a leading byte order
|
||||
// mark while the encoder adds one.
|
||||
//
|
||||
// Some editors add a byte order mark as a signature to UTF-8 files. Although
|
||||
// the byte order mark is not useful for detecting byte order in UTF-8, it is
|
||||
// sometimes used as a convention to mark UTF-8-encoded files. This relies on
|
||||
// the observation that the UTF-8 byte order mark is either an illegal or at
|
||||
// least very unlikely sequence in any other character encoding.
|
||||
var UTF8BOM encoding.Encoding = utf8bomEncoding{} |
||||
|
||||
// utf8bomEncoding is the stateless type behind UTF8BOM.
type utf8bomEncoding struct{}

// String returns the display name of the encoding.
func (utf8bomEncoding) String() string {
	const name = "UTF-8-BOM"
	return name
}
||||
|
||||
func (utf8bomEncoding) ID() (identifier.MIB, string) { |
||||
return identifier.Unofficial, "x-utf8bom" |
||||
} |
||||
|
||||
func (utf8bomEncoding) NewEncoder() *encoding.Encoder { |
||||
return &encoding.Encoder{ |
||||
Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()}, |
||||
} |
||||
} |
||||
|
||||
func (utf8bomEncoding) NewDecoder() *encoding.Decoder { |
||||
return &encoding.Decoder{Transformer: &utf8bomDecoder{}} |
||||
} |
||||
|
||||
var utf8enc = &internal.Encoding{ |
||||
&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()}, |
||||
"UTF-8", |
||||
identifier.UTF8, |
||||
} |
||||
|
||||
// utf8bomDecoder decodes UTF-8 and drops a byte order mark at the start of
// the input.
type utf8bomDecoder struct {
	// checked is set once the start of the input has been inspected.
	checked bool
}

// Reset returns the decoder to its zero state, so the next Transform call
// inspects the start of its input again.
func (t *utf8bomDecoder) Reset() {
	*t = utf8bomDecoder{}
}
||||
|
||||
func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
if !t.checked { |
||||
if !atEOF && len(src) < len(utf8BOM) { |
||||
if len(src) == 0 { |
||||
return 0, 0, nil |
||||
} |
||||
return 0, 0, transform.ErrShortSrc |
||||
} |
||||
if bytes.HasPrefix(src, []byte(utf8BOM)) { |
||||
nSrc += len(utf8BOM) |
||||
src = src[len(utf8BOM):] |
||||
} |
||||
t.checked = true |
||||
} |
||||
nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF) |
||||
nSrc += n |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type utf8bomEncoder struct { |
||||
written bool |
||||
t transform.Transformer |
||||
} |
||||
|
||||
func (t *utf8bomEncoder) Reset() { |
||||
t.written = false |
||||
t.t.Reset() |
||||
} |
||||
|
||||
func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
if !t.written { |
||||
if len(dst) < len(utf8BOM) { |
||||
return nDst, 0, transform.ErrShortDst |
||||
} |
||||
nDst = copy(dst, utf8BOM) |
||||
t.written = true |
||||
} |
||||
n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF) |
||||
nDst += n |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
type utf8Decoder struct{ transform.NopResetter } |
||||
|
||||
func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
var pSrc int // point from which to start copy in src
|
||||
var accept utf8internal.AcceptRange |
||||
|
||||
// The decoder can only make the input larger, not smaller.
|
||||
n := len(src) |
||||
if len(dst) < n { |
||||
err = transform.ErrShortDst |
||||
n = len(dst) |
||||
atEOF = false |
||||
} |
||||
for nSrc < n { |
||||
c := src[nSrc] |
||||
if c < utf8.RuneSelf { |
||||
nSrc++ |
||||
continue |
||||
} |
||||
first := utf8internal.First[c] |
||||
size := int(first & utf8internal.SizeMask) |
||||
if first == utf8internal.FirstInvalid { |
||||
goto handleInvalid // invalid starter byte
|
||||
} |
||||
accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift] |
||||
if nSrc+size > n { |
||||
if !atEOF { |
||||
// We may stop earlier than necessary here if the short sequence
|
||||
// has invalid bytes. Not checking for this simplifies the code
|
||||
// and may avoid duplicate computations in certain conditions.
|
||||
if err == nil { |
||||
err = transform.ErrShortSrc |
||||
} |
||||
break |
||||
} |
||||
// Determine the maximal subpart of an ill-formed subsequence.
|
||||
switch { |
||||
case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]: |
||||
size = 1 |
||||
case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]: |
||||
size = 2 |
||||
default: |
||||
size = 3 // As we are short, the maximum is 3.
|
||||
} |
||||
goto handleInvalid |
||||
} |
||||
if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c { |
||||
size = 1 |
||||
goto handleInvalid // invalid continuation byte
|
||||
} else if size == 2 { |
||||
} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c { |
||||
size = 2 |
||||
goto handleInvalid // invalid continuation byte
|
||||
} else if size == 3 { |
||||
} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c { |
||||
size = 3 |
||||
goto handleInvalid // invalid continuation byte
|
||||
} |
||||
nSrc += size |
||||
continue |
||||
|
||||
handleInvalid: |
||||
// Copy the scanned input so far.
|
||||
nDst += copy(dst[nDst:], src[pSrc:nSrc]) |
||||
|
||||
// Append RuneError to the destination.
|
||||
const runeError = "\ufffd" |
||||
if nDst+len(runeError) > len(dst) { |
||||
return nDst, nSrc, transform.ErrShortDst |
||||
} |
||||
nDst += copy(dst[nDst:], runeError) |
||||
|
||||
// Skip the maximal subpart of an ill-formed subsequence according to
|
||||
// the W3C standard way instead of the Go way. This Transform is
|
||||
// probably the only place in the text repo where it is warranted.
|
||||
nSrc += size |
||||
pSrc = nSrc |
||||
|
||||
// Recompute the maximum source length.
|
||||
if sz := len(dst) - nDst; sz < len(src)-nSrc { |
||||
err = transform.ErrShortDst |
||||
n = nSrc + sz |
||||
atEOF = false |
||||
} |
||||
} |
||||
return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err |
||||
} |
||||
|
||||
// UTF16 returns a UTF-16 Encoding for the given default endianness and byte
|
||||
// order mark (BOM) policy.
|
||||
//
|
||||
// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
|
||||
// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
|
||||
// the endianness used for decoding, and will instead be output as their
|
||||
// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
|
||||
// is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output.
|
||||
// Instead, it overrides the default endianness e for the remainder of the
|
||||
// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
|
||||
// affect the endianness used, and will instead be output as their standard
|
||||
// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
|
||||
// with the default Endianness. For ExpectBOM, in that case, the transformation
|
||||
// will return early with an ErrMissingBOM error.
|
||||
//
|
||||
// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
|
||||
// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
|
||||
// be inserted. The UTF-8 input does not need to contain a BOM.
|
||||
//
|
||||
// There is no concept of a 'native' endianness. If the UTF-16 data is produced
|
||||
// and consumed in a greater context that implies a certain endianness, use
|
||||
// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
|
||||
//
|
||||
// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
|
||||
// corresponds to "Where the precise type of the data stream is known... the
|
||||
// BOM should not be used" and ExpectBOM corresponds to "A particular
|
||||
// protocol... may require use of the BOM".
|
||||
func UTF16(e Endianness, b BOMPolicy) encoding.Encoding { |
||||
return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]} |
||||
} |
||||
|
||||
// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
|
||||
// some configurations map to the same MIB identifier. RFC 2781 has requirements
|
||||
// and recommendations. Some of the "configurations" are merely recommendations,
|
||||
// so multiple configurations could match.
|
||||
var mibValue = map[Endianness][numBOMValues]identifier.MIB{ |
||||
BigEndian: [numBOMValues]identifier.MIB{ |
||||
IgnoreBOM: identifier.UTF16BE, |
||||
UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781.
|
||||
// TODO: acceptBOM | strictBOM would map to UTF16BE as well.
|
||||
}, |
||||
LittleEndian: [numBOMValues]identifier.MIB{ |
||||
IgnoreBOM: identifier.UTF16LE, |
||||
UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
|
||||
// TODO: acceptBOM | strictBOM would map to UTF16LE as well.
|
||||
}, |
||||
// ExpectBOM is not widely used and has no valid MIB identifier.
|
||||
} |
||||
|
||||
// All lists a configuration for each IANA-defined UTF-16 variant.
|
||||
var All = []encoding.Encoding{ |
||||
UTF8, |
||||
UTF16(BigEndian, UseBOM), |
||||
UTF16(BigEndian, IgnoreBOM), |
||||
UTF16(LittleEndian, IgnoreBOM), |
||||
} |
||||
|
||||
// BOMPolicy is a UTF-16 encoding's byte order mark policy.
type BOMPolicy uint8

const (
	// Individual policy bits; the exported policies below combine them.
	writeBOM   BOMPolicy = 1 << 0
	acceptBOM  BOMPolicy = 1 << 1
	requireBOM BOMPolicy = 1 << 2
	bomMask    BOMPolicy = writeBOM | acceptBOM | requireBOM

	// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
	// map of an array of length 8 of a type that is also used as a key or value
	// in another map). See golang.org/issue/11354.
	// TODO: consider changing this value back to 8 if the use of 1.4.* has
	// been minimized.
	numBOMValues = 8 + 1

	// IgnoreBOM means to ignore any byte order marks.
	IgnoreBOM BOMPolicy = 0
	// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.

	// UseBOM means that the UTF-16 form may start with a byte order mark, which
	// will be used to override the default encoding.
	UseBOM BOMPolicy = writeBOM | acceptBOM
	// Common and RFC 2781-compliant interpretation for UTF-16.

	// ExpectBOM means that the UTF-16 form must start with a byte order mark,
	// which will be used to override the default encoding.
	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
	// Used in Java as Unicode (not to be confused with Java's UTF-16) and
	// ICU's UTF-16,version=1. Not compliant with RFC 2781.

	// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
	// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
	//    (UnicodeBig and UnicodeLittle in Java)
	// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
	//    acceptBOM | strictBOM (e.g. assigned to CheckBOM).
	// This addition would be consistent with supporting ExpectBOM.
)
||||
|
||||
// Endianness is a UTF-16 encoding's default endianness.
// It is a bool, so the zero value is BigEndian.
|
||||
type Endianness bool |
||||
|
||||
const ( |
||||
// BigEndian is UTF-16BE.
|
||||
BigEndian Endianness = false |
||||
// LittleEndian is UTF-16LE.
|
||||
LittleEndian Endianness = true |
||||
) |
||||
|
||||
// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a
|
||||
// starting byte order mark.
// The decoder reports it without consuming input or producing output.
|
||||
var ErrMissingBOM = errors.New("encoding: missing byte order mark") |
||||
|
||||
// utf16Encoding is one UTF-16 variant: an endianness/BOM-policy pair (the
// embedded config) together with the MIB identifier reported by ID.
type utf16Encoding struct { |
||||
config |
||||
mib identifier.MIB |
||||
} |
||||
|
||||
// config is the pair of settings that parameterizes a UTF-16 decoder.
type config struct { |
||||
// endianness is the byte order assumed unless a byte order mark overrides it.
endianness Endianness |
||||
// bomPolicy says whether a byte order mark is written, accepted or required.
bomPolicy BOMPolicy |
||||
} |
||||
|
||||
func (u utf16Encoding) NewDecoder() *encoding.Decoder { |
||||
return &encoding.Decoder{Transformer: &utf16Decoder{ |
||||
initial: u.config, |
||||
current: u.config, |
||||
}} |
||||
} |
||||
|
||||
func (u utf16Encoding) NewEncoder() *encoding.Encoder { |
||||
return &encoding.Encoder{Transformer: &utf16Encoder{ |
||||
endianness: u.endianness, |
||||
initialBOMPolicy: u.bomPolicy, |
||||
currentBOMPolicy: u.bomPolicy, |
||||
}} |
||||
} |
||||
|
||||
func (u utf16Encoding) ID() (mib identifier.MIB, other string) { |
||||
return u.mib, "" |
||||
} |
||||
|
||||
func (u utf16Encoding) String() string { |
||||
e, b := "B", "" |
||||
if u.endianness == LittleEndian { |
||||
e = "L" |
||||
} |
||||
switch u.bomPolicy { |
||||
case ExpectBOM: |
||||
b = "Expect" |
||||
case UseBOM: |
||||
b = "Use" |
||||
case IgnoreBOM: |
||||
b = "Ignore" |
||||
} |
||||
return "UTF-16" + e + "E (" + b + " BOM)" |
||||
} |
||||
|
||||
// utf16Decoder is the transformer that turns UTF-16 input into UTF-8.
type utf16Decoder struct { |
||||
// initial is the configuration restored by Reset.
initial config |
||||
// current is the working configuration; Transform updates it once it has
// dealt with a possible byte order mark.
current config |
||||
} |
||||
|
||||
// Reset restores the configuration the decoder was created with, discarding
// any endianness or BOM-policy change made while decoding.
func (u *utf16Decoder) Reset() { |
||||
u.current = u.initial |
||||
} |
||||
|
||||
func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 { |
||||
return 0, 0, ErrMissingBOM |
||||
} |
||||
if len(src) == 0 { |
||||
return 0, 0, nil |
||||
} |
||||
if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 { |
||||
switch { |
||||
case src[0] == 0xfe && src[1] == 0xff: |
||||
u.current.endianness = BigEndian |
||||
nSrc = 2 |
||||
case src[0] == 0xff && src[1] == 0xfe: |
||||
u.current.endianness = LittleEndian |
||||
nSrc = 2 |
||||
default: |
||||
if u.current.bomPolicy&requireBOM != 0 { |
||||
return 0, 0, ErrMissingBOM |
||||
} |
||||
} |
||||
u.current.bomPolicy = IgnoreBOM |
||||
} |
||||
|
||||
var r rune |
||||
var dSize, sSize int |
||||
for nSrc < len(src) { |
||||
if nSrc+1 < len(src) { |
||||
x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1]) |
||||
if u.current.endianness == LittleEndian { |
||||
x = x>>8 | x<<8 |
||||
} |
||||
r, sSize = rune(x), 2 |
||||
if utf16.IsSurrogate(r) { |
||||
if nSrc+3 < len(src) { |
||||
x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3]) |
||||
if u.current.endianness == LittleEndian { |
||||
x = x>>8 | x<<8 |
||||
} |
||||
// Save for next iteration if it is not a high surrogate.
|
||||
if isHighSurrogate(rune(x)) { |
||||
r, sSize = utf16.DecodeRune(r, rune(x)), 4 |
||||
} |
||||
} else if !atEOF { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
if dSize = utf8.RuneLen(r); dSize < 0 { |
||||
r, dSize = utf8.RuneError, 3 |
||||
} |
||||
} else if atEOF { |
||||
// Single trailing byte.
|
||||
r, dSize, sSize = utf8.RuneError, 3, 1 |
||||
} else { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
if nDst+dSize > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
nDst += utf8.EncodeRune(dst[nDst:], r) |
||||
nSrc += sSize |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
|
||||
// isHighSurrogate reports whether r lies in U+DC00..U+DFFF.
//
// NOTE: despite the name, this is the range Unicode calls the low (trailing)
// surrogates, i.e. the second code unit of a surrogate pair.
func isHighSurrogate(r rune) bool {
	return r >= 0xDC00 && r <= 0xDFFF
}
||||
|
||||
// utf16Encoder is the transformer that turns UTF-8 input into UTF-16.
type utf16Encoder struct { |
||||
// endianness is the byte order of the produced code units.
endianness Endianness |
||||
// initialBOMPolicy is the policy restored by Reset.
initialBOMPolicy BOMPolicy |
||||
// currentBOMPolicy is the working policy; Transform sets it to IgnoreBOM
// once the byte order mark has been written.
currentBOMPolicy BOMPolicy |
||||
} |
||||
|
||||
// Reset restores the BOM policy the encoder was created with, so that a byte
// order mark is written again if that policy asks for one.
func (u *utf16Encoder) Reset() { |
||||
u.currentBOMPolicy = u.initialBOMPolicy |
||||
} |
||||
|
||||
func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
if u.currentBOMPolicy&writeBOM != 0 { |
||||
if len(dst) < 2 { |
||||
return 0, 0, transform.ErrShortDst |
||||
} |
||||
dst[0], dst[1] = 0xfe, 0xff |
||||
u.currentBOMPolicy = IgnoreBOM |
||||
nDst = 2 |
||||
} |
||||
|
||||
r, size := rune(0), 0 |
||||
for nSrc < len(src) { |
||||
r = rune(src[nSrc]) |
||||
|
||||
// Decode a 1-byte rune.
|
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
|
||||
} else { |
||||
// Decode a multi-byte rune.
|
||||
r, size = utf8.DecodeRune(src[nSrc:]) |
||||
if size == 1 { |
||||
// All valid runes of size 1 (those below utf8.RuneSelf) were
|
||||
// handled above. We have invalid UTF-8 or we haven't seen the
|
||||
// full character yet.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
} |
||||
|
||||
if r <= 0xffff { |
||||
if nDst+2 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = uint8(r >> 8) |
||||
dst[nDst+1] = uint8(r) |
||||
nDst += 2 |
||||
} else { |
||||
if nDst+4 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
r1, r2 := utf16.EncodeRune(r) |
||||
dst[nDst+0] = uint8(r1 >> 8) |
||||
dst[nDst+1] = uint8(r1) |
||||
dst[nDst+2] = uint8(r2 >> 8) |
||||
dst[nDst+3] = uint8(r2) |
||||
nDst += 4 |
||||
} |
||||
nSrc += size |
||||
} |
||||
|
||||
if u.endianness == LittleEndian { |
||||
for i := 0; i < nDst; i += 2 { |
||||
dst[i], dst[i+1] = dst[i+1], dst[i] |
||||
} |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
@ -0,0 +1,87 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package utf8internal contains low-level utf8-related constants, tables, etc.
|
||||
// that are used internally by the text package.
|
||||
package utf8internal |
||||
|
||||
// LoCB and HiCB are the default lowest and highest values of a UTF-8
// continuation byte.
|
||||
const ( |
||||
LoCB = 0x80 // 1000 0000
|
||||
HiCB = 0xBF // 1011 1111
|
||||
) |
||||
|
||||
// Constants related to getting information of first bytes of UTF-8 sequences.
|
||||
const ( |
||||
// ASCII identifies a UTF-8 byte as ASCII.
|
||||
ASCII = as |
||||
|
||||
// FirstInvalid indicates a byte is invalid as a first byte of a UTF-8
|
||||
// sequence.
|
||||
FirstInvalid = xx |
||||
|
||||
// SizeMask is a mask for the size bits. Use x&SizeMask to get the size.
|
||||
SizeMask = 7 |
||||
|
||||
// AcceptShift is the right-shift count for the first byte info byte to get
|
||||
// the index into the AcceptRanges table. See AcceptRanges.
|
||||
AcceptShift = 4 |
||||
|
||||
// The names of these constants are chosen to give nice alignment in the
|
||||
// table below. The first nibble is an index into acceptRanges or F for
|
||||
// special one-byte cases. The second nibble is the Rune length or the
|
||||
// Status for the special one-byte case.
|
||||
xx = 0xF1 // invalid: size 1
|
||||
as = 0xF0 // ASCII: size 1
|
||||
s1 = 0x02 // accept 0, size 2
|
||||
s2 = 0x13 // accept 1, size 3
|
||||
s3 = 0x03 // accept 0, size 3
|
||||
s4 = 0x23 // accept 2, size 3
|
||||
s5 = 0x34 // accept 3, size 4
|
||||
s6 = 0x04 // accept 0, size 4
|
||||
s7 = 0x44 // accept 4, size 4
|
||||
) |
||||
|
||||
// First is information about the first byte in a UTF-8 sequence.
// It is indexed by the byte value. Each entry is ASCII (as), FirstInvalid (xx)
// or one of s1..s7, whose low nibble is the sequence length and whose high
// nibble is an index into AcceptRanges.
|
||||
var First = [256]uint8{ |
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
|
||||
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
|
||||
// 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
|
||||
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
|
||||
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
|
||||
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
|
||||
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
|
||||
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
|
||||
} |
||||
|
||||
// AcceptRange gives the range of valid values for the second byte in a UTF-8
|
||||
// sequence for any value for First that is not ASCII or FirstInvalid.
// Lo and Hi are themselves valid values, i.e. the range is inclusive.
|
||||
type AcceptRange struct { |
||||
Lo uint8 // lowest value for second byte.
|
||||
Hi uint8 // highest value for second byte.
|
||||
} |
||||
|
||||
// AcceptRanges is a table of AcceptRange values. For a given byte sequence b,
|
||||
//
|
||||
// AcceptRanges[First[b[0]]>>AcceptShift]
|
||||
//
|
||||
// will give the value of AcceptRange for the multi-byte UTF-8 sequence starting
|
||||
// at b[0].
|
||||
var AcceptRanges = [...]AcceptRange{ |
||||
0: {LoCB, HiCB}, |
||||
1: {0xA0, HiCB}, |
||||
2: {LoCB, 0x9F}, |
||||
3: {0x90, HiCB}, |
||||
4: {LoCB, 0x8F}, |
||||
} |
||||
@ -0,0 +1,187 @@ |
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package runes |
||||
|
||||
import ( |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is.
|
||||
// This is done for various reasons:
|
||||
// - To retain the semantics of the Nop transformer: if input is passed to a Nop
|
||||
// one would expect it to be unchanged.
|
||||
// - It would be very expensive to pass a converted RuneError to a transformer:
|
||||
// a transformer might need more source bytes after RuneError, meaning that
|
||||
// the only way to pass it safely is to create a new buffer and manage the
|
||||
// intermingling of RuneErrors and normal input.
|
||||
// - Many transformers leave ill-formed UTF-8 as is, so this is not
|
||||
// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a
|
||||
// logical consequence of the operation (as for Map) or if it otherwise would
|
||||
// pose security concerns (as for Remove).
|
||||
// - An alternative would be to return an error on ill-formed UTF-8, but this
|
||||
// would be inconsistent with other operations.
|
||||
|
||||
// If returns a transformer that applies tIn to consecutive runes for which
|
||||
// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). Reset
|
||||
// is called on tIn and tNotIn at the start of each run. A Nop transformer will
|
||||
// substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is translated
|
||||
// to RuneError to determine which transformer to apply, but is passed as is to
|
||||
// the respective transformer.
|
||||
func If(s Set, tIn, tNotIn transform.Transformer) Transformer { |
||||
if tIn == nil && tNotIn == nil { |
||||
return Transformer{transform.Nop} |
||||
} |
||||
if tIn == nil { |
||||
tIn = transform.Nop |
||||
} |
||||
if tNotIn == nil { |
||||
tNotIn = transform.Nop |
||||
} |
||||
sIn, ok := tIn.(transform.SpanningTransformer) |
||||
if !ok { |
||||
sIn = dummySpan{tIn} |
||||
} |
||||
sNotIn, ok := tNotIn.(transform.SpanningTransformer) |
||||
if !ok { |
||||
sNotIn = dummySpan{tNotIn} |
||||
} |
||||
|
||||
a := &cond{ |
||||
tIn: sIn, |
||||
tNotIn: sNotIn, |
||||
f: s.Contains, |
||||
} |
||||
a.Reset() |
||||
return Transformer{a} |
||||
} |
||||
|
||||
type dummySpan struct{ transform.Transformer } |
||||
|
||||
func (d dummySpan) Span(src []byte, atEOF bool) (n int, err error) { |
||||
return 0, transform.ErrEndOfSpan |
||||
} |
||||
|
||||
// cond is the transformer behind If. It switches between tIn and tNotIn
// depending on what f reports for each rune.
type cond struct { |
||||
tIn, tNotIn transform.SpanningTransformer |
||||
f func(rune) bool |
||||
check func(rune) bool // current check to perform
|
||||
t transform.SpanningTransformer // current transformer to use
|
||||
} |
||||
|
||||
// Reset implements transform.Transformer.
|
||||
func (t *cond) Reset() { |
||||
t.check = t.is |
||||
t.t = t.tIn |
||||
t.t.Reset() // notIn will be reset on first usage.
|
||||
} |
||||
|
||||
func (t *cond) is(r rune) bool { |
||||
if t.f(r) { |
||||
return true |
||||
} |
||||
t.check = t.isNot |
||||
t.t = t.tNotIn |
||||
t.tNotIn.Reset() |
||||
return false |
||||
} |
||||
|
||||
func (t *cond) isNot(r rune) bool { |
||||
if !t.f(r) { |
||||
return true |
||||
} |
||||
t.check = t.is |
||||
t.t = t.tIn |
||||
t.tIn.Reset() |
||||
return false |
||||
} |
||||
|
||||
// This implementation of Span doesn't help all too much, but it needs to be
|
||||
// there to satisfy this package's Transformer interface.
|
||||
// TODO: there are certainly room for improvements, though. For example, if
|
||||
// t.t == transform.Nop (which will a common occurrence) it will save a bundle
|
||||
// to special-case that loop.
|
||||
func (t *cond) Span(src []byte, atEOF bool) (n int, err error) { |
||||
p := 0 |
||||
for n < len(src) && err == nil { |
||||
// Don't process too much at a time as the Spanner that will be
|
||||
// called on this block may terminate early.
|
||||
const maxChunk = 4096 |
||||
max := len(src) |
||||
if v := n + maxChunk; v < max { |
||||
max = v |
||||
} |
||||
atEnd := false |
||||
size := 0 |
||||
current := t.t |
||||
for ; p < max; p += size { |
||||
r := rune(src[p]) |
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 { |
||||
if !atEOF && !utf8.FullRune(src[p:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
if !t.check(r) { |
||||
// The next rune will be the start of a new run.
|
||||
atEnd = true |
||||
break |
||||
} |
||||
} |
||||
n2, err2 := current.Span(src[n:p], atEnd || (atEOF && p == len(src))) |
||||
n += n2 |
||||
if err2 != nil { |
||||
return n, err2 |
||||
} |
||||
// At this point either err != nil or t.check will pass for the rune at p.
|
||||
p = n + size |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
p := 0 |
||||
for nSrc < len(src) && err == nil { |
||||
// Don't process too much at a time, as the work might be wasted if the
|
||||
// destination buffer isn't large enough to hold the result or a
|
||||
// transform returns an error early.
|
||||
const maxChunk = 4096 |
||||
max := len(src) |
||||
if n := nSrc + maxChunk; n < len(src) { |
||||
max = n |
||||
} |
||||
atEnd := false |
||||
size := 0 |
||||
current := t.t |
||||
for ; p < max; p += size { |
||||
r := rune(src[p]) |
||||
if r < utf8.RuneSelf { |
||||
size = 1 |
||||
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 { |
||||
if !atEOF && !utf8.FullRune(src[p:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
} |
||||
if !t.check(r) { |
||||
// The next rune will be the start of a new run.
|
||||
atEnd = true |
||||
break |
||||
} |
||||
} |
||||
nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src))) |
||||
nDst += nDst2 |
||||
nSrc += nSrc2 |
||||
if err2 != nil { |
||||
return nDst, nSrc, err2 |
||||
} |
||||
// At this point either err != nil or t.check will pass for the rune at p.
|
||||
p = nSrc + size |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
@ -0,0 +1,355 @@ |
||||
// Copyright 2014 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package runes provides transforms for UTF-8 encoded text.
|
||||
package runes // import "golang.org/x/text/runes"
|
||||
|
||||
import ( |
||||
"unicode" |
||||
"unicode/utf8" |
||||
|
||||
"golang.org/x/text/transform" |
||||
) |
||||
|
||||
// A Set is a collection of runes.
|
||||
type Set interface { |
||||
// Contains reports whether r is contained in the set.
|
||||
Contains(r rune) bool |
||||
} |
||||
|
||||
// setFunc adapts a plain predicate function to the Set interface.
type setFunc func(rune) bool |
||||
|
||||
// Contains returns the result of calling s with r.
func (s setFunc) Contains(r rune) bool { |
||||
return s(r) |
||||
} |
||||
|
||||
// Note: using funcs here instead of wrapping types results in cleaner
|
||||
// documentation and a smaller API.
|
||||
|
||||
// In creates a Set with a Contains method that returns true for all runes in
|
||||
// the given RangeTable.
|
||||
func In(rt *unicode.RangeTable) Set { |
||||
return setFunc(func(r rune) bool { return unicode.Is(rt, r) }) |
||||
} |
||||
|
||||
// In creates a Set with a Contains method that returns true for all runes not
|
||||
// in the given RangeTable.
|
||||
func NotIn(rt *unicode.RangeTable) Set { |
||||
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) }) |
||||
} |
||||
|
||||
// Predicate creates a Set with a Contains method that returns f(r).
// f is stored as is and called on every Contains call.
|
||||
func Predicate(f func(rune) bool) Set { |
||||
return setFunc(f) |
||||
} |
||||
|
||||
// Transformer implements the transform.Transformer interface.
// It wraps a transform.SpanningTransformer, forwards Transform, Span and Reset
// to it, and adds the Bytes and String convenience methods.
|
||||
type Transformer struct { |
||||
t transform.SpanningTransformer |
||||
} |
||||
|
||||
// Transform implements transform.Transformer by forwarding the call, with
// unchanged arguments and results, to the wrapped transformer.
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
return t.t.Transform(dst, src, atEOF) |
||||
} |
||||
|
||||
// Span implements transform.SpanningTransformer by forwarding the call, with
// unchanged arguments and results, to the wrapped transformer.
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) { |
||||
return t.t.Span(b, atEOF) |
||||
} |
||||
|
||||
// Reset implements transform.Transformer by resetting the wrapped transformer.
func (t Transformer) Reset() { t.t.Reset() } |
||||
|
||||
// Bytes returns a new byte slice with the result of converting b using t. It
|
||||
// calls Reset on t. It returns nil if any error was found. This can only happen
|
||||
// if an error-producing Transformer is passed to If.
|
||||
func (t Transformer) Bytes(b []byte) []byte { |
||||
b, _, err := transform.Bytes(t, b) |
||||
if err != nil { |
||||
return nil |
||||
} |
||||
return b |
||||
} |
||||
|
||||
// String returns a string with the result of converting s using t. It calls
|
||||
// Reset on t. It returns the empty string if any error was found. This can only
|
||||
// happen if an error-producing Transformer is passed to If.
|
||||
func (t Transformer) String(s string) string { |
||||
s, _, err := transform.String(t, s) |
||||
if err != nil { |
||||
return "" |
||||
} |
||||
return s |
||||
} |
||||
|
||||
// TODO:
|
||||
// - Copy: copying strings and bytes in whole-rune units.
|
||||
// - Validation (maybe)
|
||||
// - Well-formed-ness (maybe)
|
||||
|
||||
// runeErrorString is the UTF-8 encoding of utf8.RuneError. The transformers in
// this file copy its three bytes into dst to replace an ill-formed input byte.
const runeErrorString = string(utf8.RuneError) |
||||
|
||||
// Remove returns a Transformer that removes runes r for which s.Contains(r).
|
||||
// Illegal input bytes are replaced by RuneError before being passed to f.
|
||||
func Remove(s Set) Transformer { |
||||
if f, ok := s.(setFunc); ok { |
||||
// This little trick cuts the running time of BenchmarkRemove for sets
|
||||
// created by Predicate roughly in half.
|
||||
// TODO: special-case RangeTables as well.
|
||||
return Transformer{remove(f)} |
||||
} |
||||
return Transformer{remove(s.Contains)} |
||||
} |
||||
|
||||
// TODO: remove transform.RemoveFunc.
|
||||
|
||||
// remove is the transformer behind Remove: the function value reports which
// runes are to be dropped.
type remove func(r rune) bool |
||||
|
||||
// Reset implements transform.Transformer. It does nothing.
func (remove) Reset() {} |
||||
|
||||
// Span implements transform.Spanner.
|
||||
func (t remove) Span(src []byte, atEOF bool) (n int, err error) { |
||||
for r, size := rune(0), 0; n < len(src); { |
||||
if r = rune(src[n]); r < utf8.RuneSelf { |
||||
size = 1 |
||||
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 { |
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[n:]) { |
||||
err = transform.ErrShortSrc |
||||
} else { |
||||
err = transform.ErrEndOfSpan |
||||
} |
||||
break |
||||
} |
||||
if t(r) { |
||||
err = transform.ErrEndOfSpan |
||||
break |
||||
} |
||||
n += size |
||||
} |
||||
return |
||||
} |
||||
|
||||
// Transform implements transform.Transformer.
|
||||
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
for r, size := rune(0), 0; nSrc < len(src); { |
||||
if r = rune(src[nSrc]); r < utf8.RuneSelf { |
||||
size = 1 |
||||
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { |
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
// We replace illegal bytes with RuneError. Not doing so might
|
||||
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
|
||||
// The resulting byte sequence may subsequently contain runes
|
||||
// for which t(r) is true that were passed unnoticed.
|
||||
if !t(utf8.RuneError) { |
||||
if nDst+3 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = runeErrorString[0] |
||||
dst[nDst+1] = runeErrorString[1] |
||||
dst[nDst+2] = runeErrorString[2] |
||||
nDst += 3 |
||||
} |
||||
nSrc++ |
||||
continue |
||||
} |
||||
if t(r) { |
||||
nSrc += size |
||||
continue |
||||
} |
||||
if nDst+size > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
for i := 0; i < size; i++ { |
||||
dst[nDst] = src[nSrc] |
||||
nDst++ |
||||
nSrc++ |
||||
} |
||||
} |
||||
return |
||||
} |
||||
|
||||
// Map returns a Transformer that maps the runes in the input using the given
|
||||
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
|
||||
// being passed to the mapping func.
|
||||
func Map(mapping func(rune) rune) Transformer { |
||||
return Transformer{mapper(mapping)} |
||||
} |
||||
|
||||
// mapper is the transformer behind Map: the function value gives the
// replacement for each rune.
type mapper func(rune) rune |
||||
|
||||
// Reset implements transform.Transformer. It does nothing.
func (mapper) Reset() {} |
||||
|
||||
// Span implements transform.Spanner.
|
||||
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) { |
||||
for r, size := rune(0), 0; n < len(src); n += size { |
||||
if r = rune(src[n]); r < utf8.RuneSelf { |
||||
size = 1 |
||||
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 { |
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[n:]) { |
||||
err = transform.ErrShortSrc |
||||
} else { |
||||
err = transform.ErrEndOfSpan |
||||
} |
||||
break |
||||
} |
||||
if t(r) != r { |
||||
err = transform.ErrEndOfSpan |
||||
break |
||||
} |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
// Transform implements transform.Transformer.
|
||||
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
var replacement rune |
||||
var b [utf8.UTFMax]byte |
||||
|
||||
for r, size := rune(0), 0; nSrc < len(src); { |
||||
if r = rune(src[nSrc]); r < utf8.RuneSelf { |
||||
if replacement = t(r); replacement < utf8.RuneSelf { |
||||
if nDst == len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = byte(replacement) |
||||
nDst++ |
||||
nSrc++ |
||||
continue |
||||
} |
||||
size = 1 |
||||
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { |
||||
// Invalid rune.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
|
||||
if replacement = t(utf8.RuneError); replacement == utf8.RuneError { |
||||
if nDst+3 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = runeErrorString[0] |
||||
dst[nDst+1] = runeErrorString[1] |
||||
dst[nDst+2] = runeErrorString[2] |
||||
nDst += 3 |
||||
nSrc++ |
||||
continue |
||||
} |
||||
} else if replacement = t(r); replacement == r { |
||||
if nDst+size > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
for i := 0; i < size; i++ { |
||||
dst[nDst] = src[nSrc] |
||||
nDst++ |
||||
nSrc++ |
||||
} |
||||
continue |
||||
} |
||||
|
||||
n := utf8.EncodeRune(b[:], replacement) |
||||
|
||||
if nDst+n > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
for i := 0; i < n; i++ { |
||||
dst[nDst] = b[i] |
||||
nDst++ |
||||
} |
||||
nSrc += size |
||||
} |
||||
return |
||||
} |
||||
|
||||
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
|
||||
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
|
||||
func ReplaceIllFormed() Transformer { |
||||
return Transformer{&replaceIllFormed{}} |
||||
} |
||||
|
||||
// replaceIllFormed is the transformer behind ReplaceIllFormed. Its Reset
// method comes from the embedded transform.NopResetter.
type replaceIllFormed struct{ transform.NopResetter } |
||||
|
||||
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) { |
||||
for n < len(src) { |
||||
// ASCII fast path.
|
||||
if src[n] < utf8.RuneSelf { |
||||
n++ |
||||
continue |
||||
} |
||||
|
||||
r, size := utf8.DecodeRune(src[n:]) |
||||
|
||||
// Look for a valid non-ASCII rune.
|
||||
if r != utf8.RuneError || size != 1 { |
||||
n += size |
||||
continue |
||||
} |
||||
|
||||
// Look for short source data.
|
||||
if !atEOF && !utf8.FullRune(src[n:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
|
||||
// We have an invalid rune.
|
||||
err = transform.ErrEndOfSpan |
||||
break |
||||
} |
||||
return n, err |
||||
} |
||||
|
||||
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
||||
for nSrc < len(src) { |
||||
// ASCII fast path.
|
||||
if r := src[nSrc]; r < utf8.RuneSelf { |
||||
if nDst == len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst] = r |
||||
nDst++ |
||||
nSrc++ |
||||
continue |
||||
} |
||||
|
||||
// Look for a valid non-ASCII rune.
|
||||
if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 { |
||||
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
nDst += size |
||||
nSrc += size |
||||
continue |
||||
} |
||||
|
||||
// Look for short source data.
|
||||
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
||||
err = transform.ErrShortSrc |
||||
break |
||||
} |
||||
|
||||
// We have an invalid rune.
|
||||
if nDst+3 > len(dst) { |
||||
err = transform.ErrShortDst |
||||
break |
||||
} |
||||
dst[nDst+0] = runeErrorString[0] |
||||
dst[nDst+1] = runeErrorString[1] |
||||
dst[nDst+2] = runeErrorString[2] |
||||
nDst += 3 |
||||
nSrc++ |
||||
} |
||||
return nDst, nSrc, err |
||||
} |
||||
Loading…
Reference in new issue