Promtail: Add text encoding conversion to file targets (#6395)

Adds support for text encoding conversion for file targets. To use it, add `encoding: <encoding_name>` into the scrapeconfig.
pull/6618/head
Dylan Guedes 4 years ago committed by GitHub
parent 2f192bbb25
commit f17d3d768c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 1
      CHANGELOG.md
  2. 1
      clients/pkg/promtail/scrapeconfig/scrapeconfig.go
  3. 6
      clients/pkg/promtail/targets/file/filetarget.go
  4. 6
      clients/pkg/promtail/targets/file/filetarget_test.go
  5. 5
      clients/pkg/promtail/targets/file/filetargetmanager.go
  6. 9
      clients/pkg/promtail/targets/file/metrics.go
  7. 44
      clients/pkg/promtail/targets/file/tailer.go
  8. 3
      docs/sources/clients/promtail/configuration.md
  9. 2
      go.mod
  10. 249
      vendor/golang.org/x/text/encoding/charmap/charmap.go
  11. 7410
      vendor/golang.org/x/text/encoding/charmap/tables.go
  12. 335
      vendor/golang.org/x/text/encoding/encoding.go
  13. 74
      vendor/golang.org/x/text/encoding/ianaindex/ascii.go
  14. 214
      vendor/golang.org/x/text/encoding/ianaindex/ianaindex.go
  15. 2348
      vendor/golang.org/x/text/encoding/ianaindex/tables.go
  16. 81
      vendor/golang.org/x/text/encoding/internal/identifier/identifier.go
  17. 1619
      vendor/golang.org/x/text/encoding/internal/identifier/mib.go
  18. 75
      vendor/golang.org/x/text/encoding/internal/internal.go
  19. 12
      vendor/golang.org/x/text/encoding/japanese/all.go
  20. 225
      vendor/golang.org/x/text/encoding/japanese/eucjp.go
  21. 299
      vendor/golang.org/x/text/encoding/japanese/iso2022jp.go
  22. 189
      vendor/golang.org/x/text/encoding/japanese/shiftjis.go
  23. 26971
      vendor/golang.org/x/text/encoding/japanese/tables.go
  24. 177
      vendor/golang.org/x/text/encoding/korean/euckr.go
  25. 34152
      vendor/golang.org/x/text/encoding/korean/tables.go
  26. 12
      vendor/golang.org/x/text/encoding/simplifiedchinese/all.go
  27. 269
      vendor/golang.org/x/text/encoding/simplifiedchinese/gbk.go
  28. 245
      vendor/golang.org/x/text/encoding/simplifiedchinese/hzgb2312.go
  29. 43999
      vendor/golang.org/x/text/encoding/simplifiedchinese/tables.go
  30. 199
      vendor/golang.org/x/text/encoding/traditionalchinese/big5.go
  31. 37142
      vendor/golang.org/x/text/encoding/traditionalchinese/tables.go
  32. 82
      vendor/golang.org/x/text/encoding/unicode/override.go
  33. 512
      vendor/golang.org/x/text/encoding/unicode/unicode.go
  34. 87
      vendor/golang.org/x/text/internal/utf8internal/utf8internal.go
  35. 187
      vendor/golang.org/x/text/runes/cond.go
  36. 355
      vendor/golang.org/x/text/runes/runes.go
  37. 12
      vendor/modules.txt

@ -110,6 +110,7 @@
##### Enhancements
* [5715](https://github.com/grafana/loki/pull/5715) **chaudum**: Allow promtail to push RFC5424 formatted syslog messages
* [6395](https://github.com/grafana/loki/pull/6395) **DylanGuedes**: Add encoding support
##### Fixes
* [6034](https://github.com/grafana/loki/pull/6034) **DylanGuedes**: Promtail: Fix symlink tailing behavior.
##### Changes

@ -47,6 +47,7 @@ type Config struct {
// List of Docker service discovery configurations.
DockerSDConfigs []*moby.DockerSDConfig `yaml:"docker_sd_configs,omitempty"`
ServiceDiscoveryConfig ServiceDiscoveryConfig `yaml:",inline"`
Encoding string `yaml:"encoding,omitempty"`
}
type ServiceDiscoveryConfig struct {

@ -76,6 +76,8 @@ type FileTarget struct {
tails map[string]*tailer
targetConfig *Config
encoding string
}
// NewFileTarget create a new FileTarget.
@ -91,6 +93,7 @@ func NewFileTarget(
targetConfig *Config,
fileEventWatcher chan fsnotify.Event,
targetEventHandler chan fileTargetEvent,
encoding string,
) (*FileTarget, error) {
t := &FileTarget{
logger: logger,
@ -107,6 +110,7 @@ func NewFileTarget(
targetConfig: targetConfig,
fileEventWatcher: fileEventWatcher,
targetEventHandler: targetEventHandler,
encoding: encoding,
}
go t.run()
@ -316,7 +320,7 @@ func (t *FileTarget) startTailing(ps []string) {
}
level.Debug(t.logger).Log("msg", "tailing new file", "filename", p)
tailer, err := newTailer(t.metrics, t.logger, t.handler, t.positions, p)
tailer, err := newTailer(t.metrics, t.logger, t.handler, t.positions, p, t.encoding)
if err != nil {
level.Error(t.logger).Log("msg", "failed to start tailer", "error", err, "filename", p)
continue

@ -69,7 +69,7 @@ func TestFileTargetSync(t *testing.T) {
path := logDir1 + "/*.log"
target, err := NewFileTarget(metrics, logger, client, ps, path, "", nil, nil, &Config{
SyncPeriod: 1 * time.Minute, // assure the sync is not called by the ticker
}, nil, fakeHandler)
}, nil, fakeHandler, "")
assert.NoError(t, err)
// Start with nothing watched.
@ -221,7 +221,7 @@ func TestFileTargetPathExclusion(t *testing.T) {
pathExclude := filepath.Join(dirName, "log3", "*.log")
target, err := NewFileTarget(metrics, logger, client, ps, path, pathExclude, nil, nil, &Config{
SyncPeriod: 1 * time.Minute, // assure the sync is not called by the ticker
}, nil, fakeHandler)
}, nil, fakeHandler, "")
assert.NoError(t, err)
// Start with nothing watched.
@ -346,7 +346,7 @@ func TestHandleFileCreationEvent(t *testing.T) {
target, err := NewFileTarget(metrics, logger, client, ps, path, "", nil, nil, &Config{
// To handle file creation event from channel, set enough long time as sync period
SyncPeriod: 10 * time.Minute,
}, fakeFileHandler, fakeTargetHandler)
}, fakeFileHandler, fakeTargetHandler, "")
if err != nil {
t.Fatal(err)
}

@ -130,6 +130,7 @@ func NewFileTargetManager(
entryHandler: pipeline.Wrap(client),
targetConfig: targetConfig,
fileEventWatchers: map[string]chan fsnotify.Event{},
encoding: cfg.Encoding,
}
tm.syncers[cfg.JobName] = s
configs[cfg.JobName] = cfg.ServiceDiscoveryConfig.Configs()
@ -260,6 +261,8 @@ type targetSyncer struct {
relabelConfig []*relabel.Config
targetConfig *Config
encoding string
}
// sync synchronize target based on received target groups received by service discovery
@ -395,7 +398,7 @@ func (s *targetSyncer) sendFileCreateEvent(event fsnotify.Event) {
}
func (s *targetSyncer) newTarget(path, pathExclude string, labels model.LabelSet, discoveredLabels model.LabelSet, fileEventWatcher chan fsnotify.Event, targetEventHandler chan fileTargetEvent) (*FileTarget, error) {
return NewFileTarget(s.metrics, s.log, s.entryHandler, s.positions, path, pathExclude, labels, discoveredLabels, s.targetConfig, fileEventWatcher, targetEventHandler)
return NewFileTarget(s.metrics, s.log, s.entryHandler, s.positions, path, pathExclude, labels, discoveredLabels, s.targetConfig, fileEventWatcher, targetEventHandler, s.encoding)
}
func (s *targetSyncer) DroppedTargets() []target.Target {

@ -8,10 +8,11 @@ type Metrics struct {
reg prometheus.Registerer
// File-specific metrics
readBytes *prometheus.GaugeVec
totalBytes *prometheus.GaugeVec
readLines *prometheus.CounterVec
filesActive prometheus.Gauge
readBytes *prometheus.GaugeVec
totalBytes *prometheus.GaugeVec
readLines *prometheus.CounterVec
encodingFailures *prometheus.CounterVec
filesActive prometheus.Gauge
// Manager metrics
failedTargets *prometheus.CounterVec

@ -1,6 +1,7 @@
package file
import (
"fmt"
"os"
"sync"
"time"
@ -8,8 +9,12 @@ import (
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/hpcloud/tail"
"github.com/pkg/errors"
"github.com/prometheus/common/model"
"go.uber.org/atomic"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/ianaindex"
"golang.org/x/text/transform"
"github.com/grafana/loki/clients/pkg/promtail/api"
"github.com/grafana/loki/clients/pkg/promtail/positions"
@ -34,9 +39,11 @@ type tailer struct {
posquit chan struct{}
posdone chan struct{}
done chan struct{}
decoder *encoding.Decoder
}
func newTailer(metrics *Metrics, logger log.Logger, handler api.EntryHandler, positions positions.Positions, path string) (*tailer, error) {
func newTailer(metrics *Metrics, logger log.Logger, handler api.EntryHandler, positions positions.Positions, path string, encoding string) (*tailer, error) {
// Simple check to make sure the file we are tailing doesn't
// have a position already saved which is past the end of the file.
fi, err := os.Stat(path)
@ -81,6 +88,16 @@ func newTailer(metrics *Metrics, logger log.Logger, handler api.EntryHandler, po
done: make(chan struct{}),
}
if encoding != "" {
level.Info(tailer.logger).Log("msg", "Will decode messages", "from", encoding, "to", "UTF8")
encoder, err := ianaindex.IANA.Encoding(encoding)
if err != nil {
return nil, errors.Wrap(err, "error doing IANA encoding")
}
decoder := encoder.NewDecoder()
tailer.decoder = decoder
}
go tailer.readLines()
go tailer.updatePosition()
metrics.filesActive.Add(1.)
@ -149,15 +166,27 @@ func (t *tailer) readLines() {
continue
}
var text string
if t.decoder != nil {
var err error
text, err = t.convertToUTF8(line.Text)
if err != nil {
level.Debug(t.logger).Log("msg", "failed to convert encoding", "error", err)
t.metrics.encodingFailures.WithLabelValues(t.path).Inc()
text = fmt.Sprintf("the requested encoding conversion for this line failed in Promtail/Grafana Agent: %s", err.Error())
}
} else {
text = line.Text
}
t.metrics.readLines.WithLabelValues(t.path).Inc()
entries <- api.Entry{
Labels: model.LabelSet{},
Entry: logproto.Entry{
Timestamp: line.Time,
Line: line.Text,
Line: text,
},
}
}
}
@ -217,6 +246,15 @@ func (t *tailer) isRunning() bool {
return t.running.Load()
}
func (t *tailer) convertToUTF8(text string) (string, error) {
res, _, err := transform.String(t.decoder, text)
if err != nil {
return "", errors.Wrap(err, "error decoding text")
}
return res, nil
}
// cleanupMetrics removes all metrics exported by this tailer
func (t *tailer) cleanupMetrics() {
// When we stop tailing the file, also un-export metrics related to the file

@ -318,6 +318,9 @@ job_name: <string>
# Describes how to scrape logs from the journal.
[journal: <journal_config>]
# Describes from which encoding a scraped file should be converted.
[encoding: <iana_encoding_name>]
# Describes how to receive logs from syslog.
[syslog: <syslog_config>]

@ -116,6 +116,7 @@ require (
github.com/prometheus/alertmanager v0.24.0
github.com/prometheus/common/sigv4 v0.1.0
github.com/willf/bloom v2.0.3+incompatible
golang.org/x/text v0.3.7
)
require (
@ -265,7 +266,6 @@ require (
golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3 // indirect
golang.org/x/oauth2 v0.0.0-20220524215830-622c5d57e401 // indirect
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/tools v0.1.10 // indirect
golang.org/x/xerrors v0.0.0-20220517211312-f3a8303e98df // indirect
google.golang.org/appengine v1.6.7 // indirect

@ -0,0 +1,249 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run maketables.go
// Package charmap provides simple character encodings such as IBM Code Page 437
// and Windows 1252.
package charmap // import "golang.org/x/text/encoding/charmap"
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// These encodings vary only in the way clients should interpret them. Their
// coded character set is identical and a single implementation can be shared.
var (
// ISO8859_6E is the ISO 8859-6E encoding.
ISO8859_6E encoding.Encoding = &iso8859_6E
// ISO8859_6I is the ISO 8859-6I encoding.
ISO8859_6I encoding.Encoding = &iso8859_6I
// ISO8859_8E is the ISO 8859-8E encoding.
ISO8859_8E encoding.Encoding = &iso8859_8E
// ISO8859_8I is the ISO 8859-8I encoding.
ISO8859_8I encoding.Encoding = &iso8859_8I
iso8859_6E = internal.Encoding{
Encoding: ISO8859_6,
Name: "ISO-8859-6E",
MIB: identifier.ISO88596E,
}
iso8859_6I = internal.Encoding{
Encoding: ISO8859_6,
Name: "ISO-8859-6I",
MIB: identifier.ISO88596I,
}
iso8859_8E = internal.Encoding{
Encoding: ISO8859_8,
Name: "ISO-8859-8E",
MIB: identifier.ISO88598E,
}
iso8859_8I = internal.Encoding{
Encoding: ISO8859_8,
Name: "ISO-8859-8I",
MIB: identifier.ISO88598I,
}
)
// All is a list of all defined encodings in this package.
var All []encoding.Encoding = listAll
// TODO: implement these encodings, in order of importance.
// ASCII, ISO8859_1: Rather common. Close to Windows 1252.
// ISO8859_9: Close to Windows 1254.
// utf8Enc holds a rune's UTF-8 encoding in data[:len].
type utf8Enc struct {
len uint8
data [3]byte
}
// Charmap is an 8-bit character set encoding.
type Charmap struct {
// name is the encoding's name.
name string
// mib is the encoding type of this encoder.
mib identifier.MIB
// asciiSuperset states whether the encoding is a superset of ASCII.
asciiSuperset bool
// low is the lower bound of the encoded byte for a non-ASCII rune. If
// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
low uint8
// replacement is the encoded replacement character.
replacement byte
// decode is the map from encoded byte to UTF-8.
decode [256]utf8Enc
// encoding is the map from runes to encoded bytes. Each entry is a
// uint32: the high 8 bits are the encoded byte and the low 24 bits are
// the rune. The table entries are sorted by ascending rune.
encode [256]uint32
}
// NewDecoder implements the encoding.Encoding interface.
func (m *Charmap) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}
}
// NewEncoder implements the encoding.Encoding interface.
func (m *Charmap) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}
}
// String returns the Charmap's name.
func (m *Charmap) String() string {
return m.name
}
// ID implements an internal interface.
func (m *Charmap) ID() (mib identifier.MIB, other string) {
return m.mib, ""
}
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
type charmapDecoder struct {
transform.NopResetter
charmap *Charmap
}
func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for i, c := range src {
if m.charmap.asciiSuperset && c < utf8.RuneSelf {
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = c
nDst++
nSrc = i + 1
continue
}
decode := &m.charmap.decode[c]
n := int(decode.len)
if nDst+n > len(dst) {
err = transform.ErrShortDst
break
}
// It's 15% faster to avoid calling copy for these tiny slices.
for j := 0; j < n; j++ {
dst[nDst] = decode.data[j]
nDst++
}
nSrc = i + 1
}
return nDst, nSrc, err
}
// DecodeByte returns the Charmap's rune decoding of the byte b.
func (m *Charmap) DecodeByte(b byte) rune {
switch x := &m.decode[b]; x.len {
case 1:
return rune(x.data[0])
case 2:
return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f)
default:
return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f)
}
}
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
type charmapEncoder struct {
transform.NopResetter
charmap *Charmap
}
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for nSrc < len(src) {
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
if m.charmap.asciiSuperset {
nSrc++
dst[nDst] = uint8(r)
nDst++
continue
}
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
} else {
err = internal.RepertoireError(m.charmap.replacement)
}
break
}
}
// Binary search in [low, high) for that rune in the m.charmap.encode table.
for low, high := int(m.charmap.low), 0x100; ; {
if low >= high {
err = internal.RepertoireError(m.charmap.replacement)
break loop
}
mid := (low + high) / 2
got := m.charmap.encode[mid]
gotRune := rune(got & (1<<24 - 1))
if gotRune < r {
low = mid + 1
} else if gotRune > r {
high = mid
} else {
dst[nDst] = byte(got >> 24)
nDst++
break
}
}
nSrc += size
}
return nDst, nSrc, err
}
// EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether
// r is in the Charmap's repertoire. If not, b is set to the Charmap's
// replacement byte. This is often the ASCII substitute character '\x1a'.
func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) {
if r < utf8.RuneSelf && m.asciiSuperset {
return byte(r), true
}
for low, high := int(m.low), 0x100; ; {
if low >= high {
return m.replacement, false
}
mid := (low + high) / 2
got := m.encode[mid]
gotRune := rune(got & (1<<24 - 1))
if gotRune < r {
low = mid + 1
} else if gotRune > r {
high = mid
} else {
return byte(got >> 24), true
}
}
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,335 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding // import "golang.org/x/text/encoding"
import (
"errors"
"io"
"strconv"
"unicode/utf8"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// TODO:
// - There seems to be some inconsistency in when decoders return errors
// and when not. Also documentation seems to suggest they shouldn't return
// errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
// normal form. Perhaps add an example how users could prepare their output.
// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
// NewDecoder returns a Decoder.
NewDecoder() *Decoder
// NewEncoder returns an Encoder.
NewEncoder() *Encoder
}
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
transform.Transformer
// This forces external creators of Decoders to use names in struct
// initializers, allowing for future extendibility without having to break
// code.
_ struct{}
}
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
// bytes or nil, err if any error occurred.
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
b, _, err := transform.Bytes(d, b)
if err != nil {
return nil, err
}
return b, nil
}
// String converts the given encoded string to UTF-8. It returns the converted
// string or "", err if any error occurred.
func (d *Decoder) String(s string) (string, error) {
s, _, err := transform.String(d, s)
if err != nil {
return "", err
}
return s, nil
}
// Reader wraps another Reader to decode its bytes.
//
// The Decoder may not be used for any other operation as long as the returned
// Reader is in use.
func (d *Decoder) Reader(r io.Reader) io.Reader {
return transform.NewReader(r, d)
}
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source byte up to, not including the offending
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
transform.Transformer
// This forces external creators of Encoders to use names in struct
// initializers, allowing for future extendibility without having to break
// code.
_ struct{}
}
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
// any error occurred.
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
b, _, err := transform.Bytes(e, b)
if err != nil {
return nil, err
}
return b, nil
}
// String converts a string from UTF-8. It returns the converted string or
// "", err if any error occurred.
func (e *Encoder) String(s string) (string, error) {
s, _, err := transform.String(e, s)
if err != nil {
return "", err
}
return s, nil
}
// Writer wraps another Writer to encode its UTF-8 output.
//
// The Encoder may not be used for any other operation as long as the returned
// Writer is in use.
func (e *Encoder) Writer(w io.Writer) io.Writer {
return transform.NewWriter(w, e)
}
// ASCIISub is the ASCII substitute character, as recommended by
// https://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
var Nop Encoding = nop{}
type nop struct{}
func (nop) NewDecoder() *Decoder {
return &Decoder{Transformer: transform.Nop}
}
func (nop) NewEncoder() *Encoder {
return &Encoder{Transformer: transform.Nop}
}
// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}
type replacement struct{}
func (replacement) NewDecoder() *Decoder {
return &Decoder{Transformer: replacementDecoder{}}
}
func (replacement) NewEncoder() *Encoder {
return &Encoder{Transformer: replacementEncoder{}}
}
func (replacement) ID() (mib identifier.MIB, other string) {
return identifier.Replacement, ""
}
type replacementDecoder struct{ transform.NopResetter }
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if len(dst) < 3 {
return 0, 0, transform.ErrShortDst
}
if atEOF {
const fffd = "\ufffd"
dst[0] = fffd[0]
dst[1] = fffd[1]
dst[2] = fffd[2]
nDst = 3
}
return nDst, len(src), nil
}
type replacementEncoder struct{ transform.NopResetter }
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
r = '\ufffd'
}
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with HTML escape sequences.
//
// This wrapper exists to comply to URL and HTML forms requiring a
// non-terminating legacy encoder. The produced sequences may lead to data
// loss as they are indistinguishable from legitimate input. To avoid this
// issue, use UTF-8 encodings whenever possible.
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
}
// ReplaceUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with an encoding-specific
// replacement.
//
// This wrapper is only provided for backwards compatibility and legacy
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
func ReplaceUnsupported(e *Encoder) *Encoder {
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
}
type errorHandler struct {
*Encoder
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}
// TODO: consider making this error public in some form.
type repertoireError interface {
Replacement() byte
}
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
for err != nil {
rerr, ok := err.(repertoireError)
if !ok {
return nDst, nSrc, err
}
r, sz := utf8.DecodeRune(src[nSrc:])
n, ok := h.handler(dst[nDst:], r, rerr)
if !ok {
return nDst, nSrc, transform.ErrShortDst
}
err = nil
nDst += n
if nSrc += sz; nSrc < len(src) {
var dn, sn int
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
nDst += dn
nSrc += sn
}
}
return nDst, nSrc, err
}
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
buf := [8]byte{}
b := strconv.AppendUint(buf[:0], uint64(r), 10)
if n = len(b) + len("&#;"); n >= len(dst) {
return 0, false
}
dst[0] = '&'
dst[1] = '#'
dst[copy(dst[2:], b)+2] = ';'
return n, true
}
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
if len(dst) == 0 {
return 0, false
}
dst[0] = err.Replacement()
return 1, true
}
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8.
var UTF8Validator transform.Transformer = utf8Validator{}
type utf8Validator struct{ transform.NopResetter }
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
n := len(src)
if n > len(dst) {
n = len(dst)
}
for i := 0; i < n; {
if c := src[i]; c < utf8.RuneSelf {
dst[i] = c
i++
continue
}
_, size := utf8.DecodeRune(src[i:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
err = ErrInvalidUTF8
if !atEOF && !utf8.FullRune(src[i:]) {
err = transform.ErrShortSrc
}
return i, i, err
}
if i+size > len(dst) {
return i, i, transform.ErrShortDst
}
for ; size > 0; size-- {
dst[i] = src[i]
i++
}
}
if len(src) > len(dst) {
err = transform.ErrShortDst
}
return n, n, err
}

@ -0,0 +1,74 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ianaindex
import (
"unicode"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
type asciiDecoder struct {
transform.NopResetter
}
func (d asciiDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for _, c := range src {
if c > unicode.MaxASCII {
r := unicode.ReplacementChar
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
nSrc++
continue
}
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = c
nDst++
nSrc++
}
return nDst, nSrc, err
}
type asciiEncoder struct {
transform.NopResetter
}
func (d asciiEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for _, c := range src {
if c > unicode.MaxASCII {
err = internal.RepertoireError(encoding.ASCIISub)
break
}
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = c
nDst++
nSrc++
}
return nDst, nSrc, err
}
var asciiEnc = &internal.Encoding{
Encoding: &internal.SimpleEncoding{
asciiDecoder{},
asciiEncoder{},
},
Name: "US-ASCII",
MIB: identifier.ASCII,
}

@ -0,0 +1,214 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package ianaindex maps names to Encodings as specified by the IANA registry.
// This includes both the MIME and IANA names.
//
// See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
// more details.
package ianaindex
import (
"errors"
"sort"
"strings"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
)
// TODO: remove the "Status... incomplete" in the package doc comment.
// TODO: allow users to specify their own aliases?
// TODO: allow users to specify their own indexes?
// TODO: allow canonicalizing names
// NOTE: only use these top-level variables if we can get the linker to drop
// the indexes when they are not used. Make them a function or perhaps only
// support MIME otherwise.
var (
// MIME is an index to map MIME names.
MIME *Index = mime
// IANA is an index that supports all names and aliases using IANA names as
// the canonical identifier.
IANA *Index = iana
// MIB is an index that associates the MIB display name with an Encoding.
MIB *Index = mib
mime = &Index{mimeName, ianaToMIB, ianaAliases, encodings[:]}
iana = &Index{ianaName, ianaToMIB, ianaAliases, encodings[:]}
mib = &Index{mibName, ianaToMIB, ianaAliases, encodings[:]}
)
// Index maps names registered by IANA to Encodings.
// Currently different Indexes only differ in the names they return for
// encodings. In the future they may also differ in supported aliases.
type Index struct {
names func(i int) string
toMIB []identifier.MIB // Sorted slice of supported MIBs
alias map[string]int
enc []encoding.Encoding
}
var (
errInvalidName = errors.New("ianaindex: invalid encoding name")
errUnknown = errors.New("ianaindex: unknown Encoding")
errUnsupported = errors.New("ianaindex: unsupported Encoding")
)
// Encoding returns an Encoding for IANA-registered names. Matching is
// case-insensitive.
//
// If the provided name doesn't match a IANA-registered charset, an error is
// returned. If the name matches a IANA-registered charset but isn't supported,
// a nil encoding and a nil error are returned.
func (x *Index) Encoding(name string) (encoding.Encoding, error) {
name = strings.TrimSpace(name)
// First try without lowercasing (possibly creating an allocation).
i, ok := x.alias[name]
if !ok {
i, ok = x.alias[strings.ToLower(name)]
if !ok {
return nil, errInvalidName
}
}
return x.enc[i], nil
}
// Name reports the canonical name of the given Encoding. It will return an
// error if the e is not associated with a known encoding scheme.
func (x *Index) Name(e encoding.Encoding) (string, error) {
id, ok := e.(identifier.Interface)
if !ok {
return "", errUnknown
}
mib, _ := id.ID()
if mib == 0 {
return "", errUnknown
}
v := findMIB(x.toMIB, mib)
if v == -1 {
return "", errUnsupported
}
return x.names(v), nil
}
// TODO: the coverage of this index is rather spotty. Allowing users to set
// encodings would allow:
// - users to increase coverage
// - allow a partially loaded set of encodings in case the user doesn't need to
// them all.
// - write an OS-specific wrapper for supported encodings and set them.
// The exact definition of Set depends a bit on if and how we want to let users
// write their own Encoding implementations. Also, it is not possible yet to
// only partially load the encodings without doing some refactoring. Until this
// is solved, we might as well not support Set.
// // Set sets the e to be used for the encoding scheme identified by name. Only
// // canonical names may be used. An empty name assigns e to its internally
// // associated encoding scheme.
// func (x *Index) Set(name string, e encoding.Encoding) error {
// panic("TODO: implement")
// }
func findMIB(x []identifier.MIB, mib identifier.MIB) int {
i := sort.Search(len(x), func(i int) bool { return x[i] >= mib })
if i < len(x) && x[i] == mib {
return i
}
return -1
}
const maxMIMENameLen = '0' - 1 // officially 40, but we leave some buffer.
func mimeName(x int) string {
n := ianaNames[x]
// See gen.go for a description of the encoding.
if n[0] <= maxMIMENameLen {
return n[1:n[0]]
}
return n
}
func ianaName(x int) string {
n := ianaNames[x]
// See gen.go for a description of the encoding.
if n[0] <= maxMIMENameLen {
return n[n[0]:]
}
return n
}
func mibName(x int) string {
return mibNames[x]
}
var encodings = [numIANA]encoding.Encoding{
enc3: asciiEnc,
enc106: unicode.UTF8,
enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
enc1014: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
enc2028: charmap.CodePage037,
enc2011: charmap.CodePage437,
enc2009: charmap.CodePage850,
enc2010: charmap.CodePage852,
enc2046: charmap.CodePage855,
enc2089: charmap.CodePage858,
enc2048: charmap.CodePage860,
enc2013: charmap.CodePage862,
enc2050: charmap.CodePage863,
enc2052: charmap.CodePage865,
enc2086: charmap.CodePage866,
enc2102: charmap.CodePage1047,
enc2091: charmap.CodePage1140,
enc4: charmap.ISO8859_1,
enc5: charmap.ISO8859_2,
enc6: charmap.ISO8859_3,
enc7: charmap.ISO8859_4,
enc8: charmap.ISO8859_5,
enc9: charmap.ISO8859_6,
enc81: charmap.ISO8859_6E,
enc82: charmap.ISO8859_6I,
enc10: charmap.ISO8859_7,
enc11: charmap.ISO8859_8,
enc84: charmap.ISO8859_8E,
enc85: charmap.ISO8859_8I,
enc12: charmap.ISO8859_9,
enc13: charmap.ISO8859_10,
enc109: charmap.ISO8859_13,
enc110: charmap.ISO8859_14,
enc111: charmap.ISO8859_15,
enc112: charmap.ISO8859_16,
enc2084: charmap.KOI8R,
enc2088: charmap.KOI8U,
enc2027: charmap.Macintosh,
enc2109: charmap.Windows874,
enc2250: charmap.Windows1250,
enc2251: charmap.Windows1251,
enc2252: charmap.Windows1252,
enc2253: charmap.Windows1253,
enc2254: charmap.Windows1254,
enc2255: charmap.Windows1255,
enc2256: charmap.Windows1256,
enc2257: charmap.Windows1257,
enc2258: charmap.Windows1258,
enc18: japanese.EUCJP,
enc39: japanese.ISO2022JP,
enc17: japanese.ShiftJIS,
enc38: korean.EUCKR,
enc114: simplifiedchinese.GB18030,
enc113: simplifiedchinese.GBK,
enc2085: simplifiedchinese.HZGB2312,
enc2026: traditionalchinese.Big5,
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,81 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package identifier defines the contract between implementations of Encoding
// and Index by defining identifiers that uniquely identify standardized coded
// character sets (CCS) and character encoding schemes (CES), which we will
// together refer to as encodings, for which Encoding implementations provide
// converters to and from UTF-8. This package is typically only of concern to
// implementers of Indexes and Encodings.
//
// One part of the identifier is the MIB code, which is defined by IANA and
// uniquely identifies a CCS or CES. Each code is associated with data that
// references authorities, official documentation as well as aliases and MIME
// names.
//
// Not all CESs are covered by the IANA registry. The "other" string that is
// returned by ID can be used to identify other character sets or versions of
// existing ones.
//
// It is recommended that each package that provides a set of Encodings provide
// the All and Common variables to reference all supported encodings and
// commonly used subset. This allows Index implementations to include all
// available encodings without explicitly referencing or knowing about them.
package identifier
// Note: this package is internal, but could be made public if there is a need
// for writing third-party Indexes and Encodings.
// References:
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
// - http://www.ietf.org/rfc/rfc2978.txt
// - https://www.unicode.org/reports/tr22/
// - http://www.w3.org/TR/encoding/
// - https://encoding.spec.whatwg.org/
// - https://encoding.spec.whatwg.org/encodings.json
// - https://tools.ietf.org/html/rfc6657#section-5
// Interface can be implemented by Encodings to define the CCS or CES for which
// it implements conversions.
type Interface interface {
// ID returns an encoding identifier. Exactly one of the mib and other
// values should be non-zero.
//
// In the usual case it is only necessary to indicate the MIB code. The
// other string can be used to specify encodings for which there is no MIB,
// such as "x-mac-dingbat".
//
// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
ID() (mib MIB, other string)
// NOTE: the restrictions on the encoding are to allow extending the syntax
// with additional information such as versions, vendors and other variants.
}
// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds
// some identifiers for some encodings that are not covered by the IANA
// standard.
//
// See http://www.iana.org/assignments/ianacharset-mib.
type MIB uint16
// These additional MIB types are not defined in IANA. They are added because
// they are common and defined within the text repo.
const (
// Unofficial marks the start of encodings not registered by IANA.
Unofficial MIB = 10000 + iota
// Replacement is the WhatWG replacement encoding.
Replacement
// XUserDefined is the code for x-user-defined.
XUserDefined
// MacintoshCyrillic is the code for x-mac-cyrillic.
MacintoshCyrillic
)

File diff suppressed because it is too large Load Diff

@ -0,0 +1,75 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package internal contains code that is shared among encoding implementations.
package internal
import (
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// Encoding is an implementation of the Encoding interface that adds the String
// and ID methods to an existing encoding.
type Encoding struct {
encoding.Encoding
Name string
MIB identifier.MIB
}
// _ verifies that Encoding implements identifier.Interface.
var _ identifier.Interface = (*Encoding)(nil)
func (e *Encoding) String() string {
return e.Name
}
func (e *Encoding) ID() (mib identifier.MIB, other string) {
return e.MIB, ""
}
// SimpleEncoding is an Encoding that combines two Transformers.
type SimpleEncoding struct {
Decoder transform.Transformer
Encoder transform.Transformer
}
func (e *SimpleEncoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: e.Decoder}
}
func (e *SimpleEncoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: e.Encoder}
}
// FuncEncoding is an Encoding that combines two functions returning a new
// Transformer.
type FuncEncoding struct {
Decoder func() transform.Transformer
Encoder func() transform.Transformer
}
func (e FuncEncoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: e.Decoder()}
}
func (e FuncEncoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: e.Encoder()}
}
// A RepertoireError indicates a rune is not in the repertoire of a destination
// encoding. It is associated with an encoding-specific suggested replacement
// byte.
type RepertoireError byte
// Error implements the error interrface.
func (r RepertoireError) Error() string {
return "encoding: rune not supported by encoding."
}
// Replacement returns the replacement string associated with this error.
func (r RepertoireError) Replacement() byte { return byte(r) }
var ErrASCIIReplacement = RepertoireError(encoding.ASCIISub)

@ -0,0 +1,12 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"golang.org/x/text/encoding"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{EUCJP, ISO2022JP, ShiftJIS}

@ -0,0 +1,225 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// EUCJP is the EUC-JP encoding.
var EUCJP encoding.Encoding = &eucJP
var eucJP = internal.Encoding{
&internal.SimpleEncoding{eucJPDecoder{}, eucJPEncoder{}},
"EUC-JP",
identifier.EUCPkdFmtJapanese,
}
type eucJPDecoder struct{ transform.NopResetter }
// See https://encoding.spec.whatwg.org/#euc-jp-decoder.
func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case c0 == 0x8e:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
switch {
case c1 < 0xa1:
r, size = utf8.RuneError, 1
case c1 > 0xdf:
r, size = utf8.RuneError, 2
if c1 == 0xff {
size = 1
}
default:
r, size = rune(c1)+(0xff61-0xa1), 2
}
case c0 == 0x8f:
if nSrc+2 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
if p := nSrc + 1; p < len(src) && 0xa1 <= src[p] && src[p] < 0xfe {
size = 2
}
break
}
c1 := src[nSrc+1]
if c1 < 0xa1 || 0xfe < c1 {
r, size = utf8.RuneError, 1
break
}
c2 := src[nSrc+2]
if c2 < 0xa1 || 0xfe < c2 {
r, size = utf8.RuneError, 2
break
}
r, size = utf8.RuneError, 3
if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) {
r = rune(jis0212Decode[i])
if r == 0 {
r = utf8.RuneError
}
}
case 0xa1 <= c0 && c0 <= 0xfe:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
if c1 < 0xa1 || 0xfe < c1 {
r, size = utf8.RuneError, 1
break
}
r, size = utf8.RuneError, 2
if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
if r == 0 {
r = utf8.RuneError
}
}
default:
r, size = utf8.RuneError, 1
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type eucJPEncoder struct{ transform.NopResetter }
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2or3
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2or3
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2or3
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2or3
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2or3
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
goto write2
}
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2or3
}
}
err = internal.ErrASCIIReplacement
break
}
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2or3:
if r>>tableShift == jis0208 {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
} else {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = 0x8f
nDst++
}
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0xa1 + uint8(r)&codeMask
nDst += 2
continue
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x8e
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 6 {
panic("bad numEncodeTables")
}
}

@ -0,0 +1,299 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// ISO2022JP is the ISO-2022-JP encoding.
var ISO2022JP encoding.Encoding = &iso2022JP
var iso2022JP = internal.Encoding{
internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
"ISO-2022-JP",
identifier.ISO2022JP,
}
func iso2022JPNewDecoder() transform.Transformer {
return new(iso2022JPDecoder)
}
func iso2022JPNewEncoder() transform.Transformer {
return new(iso2022JPEncoder)
}
const (
asciiState = iota
katakanaState
jis0208State
jis0212State
)
const asciiEsc = 0x1b
type iso2022JPDecoder int
func (d *iso2022JPDecoder) Reset() {
*d = asciiState
}
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
c0 := src[nSrc]
if c0 >= utf8.RuneSelf {
r, size = '\ufffd', 1
goto write
}
if c0 == asciiEsc {
if nSrc+2 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
// TODO: is it correct to only skip 1??
r, size = '\ufffd', 1
goto write
}
size = 3
c1 := src[nSrc+1]
c2 := src[nSrc+2]
switch {
case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
*d = jis0208State
continue
case c1 == '$' && c2 == '(': // 0x24 0x28
if nSrc+3 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
r, size = '\ufffd', 1
goto write
}
size = 4
if src[nSrc+3] == 'D' {
*d = jis0212State
continue
}
case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
*d = asciiState
continue
case c1 == '(' && c2 == 'I': // 0x28 0x49
*d = katakanaState
continue
}
r, size = '\ufffd', 1
goto write
}
switch *d {
case asciiState:
r, size = rune(c0), 1
case katakanaState:
if c0 < 0x21 || 0x60 <= c0 {
r, size = '\ufffd', 1
goto write
}
r, size = rune(c0)+(0xff61-0x21), 1
default:
if c0 == 0x0a {
*d = asciiState
r, size = rune(c0), 1
goto write
}
if nSrc+1 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
r, size = '\ufffd', 1
goto write
}
size = 2
c1 := src[nSrc+1]
i := int(c0-0x21)*94 + int(c1-0x21)
if *d == jis0208State && i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
} else if *d == jis0212State && i < len(jis0212Decode) {
r = rune(jis0212Decode[i])
} else {
r = '\ufffd'
goto write
}
if r == 0 {
r = '\ufffd'
}
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
return nDst, nSrc, transform.ErrShortDst
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type iso2022JPEncoder int
func (e *iso2022JPEncoder) Reset() {
*e = asciiState
}
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
//
// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
// is not used by the iso-2022-jp encoder due to lack of widespread support".
//
// TODO: do we have to special-case U+00A5 and U+203E, as per
// http://encoding.spec.whatwg.org/#iso-2022-jp
// Doing so would mean that "\u00a5" would not be preserved
// after an encode-decode round trip.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
goto writeKatakana
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto writeJIS
}
}
// Switch back to ASCII state in case of error so that an ASCII
// replacement character can be written in the correct state.
if *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
}
err = internal.ErrASCIIReplacement
break
}
if *e != asciiState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
writeJIS:
if *e != jis0208State {
if nDst+5 > len(dst) {
err = transform.ErrShortDst
break
}
*e = jis0208State
dst[nDst+0] = asciiEsc
dst[nDst+1] = '$'
dst[nDst+2] = 'B'
nDst += 3
} else if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0x21 + uint8(r)&codeMask
nDst += 2
continue
writeKatakana:
if *e != katakanaState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = katakanaState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'I'
nDst += 3
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r - (0xff61 - 0x21))
nDst++
continue
}
if atEOF && err == nil && *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
} else {
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
}
}
return nDst, nSrc, err
}

@ -0,0 +1,189 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
// Windows-31J.
var ShiftJIS encoding.Encoding = &shiftJIS
var shiftJIS = internal.Encoding{
&internal.SimpleEncoding{shiftJISDecoder{}, shiftJISEncoder{}},
"Shift JIS",
identifier.ShiftJIS,
}
type shiftJISDecoder struct{ transform.NopResetter }
func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0xa1 <= c0 && c0 < 0xe0:
r, size = rune(c0)+(0xff61-0xa1), 1
case (0x81 <= c0 && c0 < 0xa0) || (0xe0 <= c0 && c0 < 0xfd):
if c0 <= 0x9f {
c0 -= 0x70
} else {
c0 -= 0xb0
}
c0 = 2*c0 - 0x21
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = '\ufffd', 1
goto write
}
c1 := src[nSrc+1]
switch {
case c1 < 0x40:
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
goto write
case c1 < 0x7f:
c0--
c1 -= 0x40
case c1 == 0x7f:
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
goto write
case c1 < 0x9f:
c0--
c1 -= 0x41
case c1 < 0xfd:
c1 -= 0x9f
default:
r, size = '\ufffd', 2
goto write
}
r, size = '\ufffd', 2
if i := int(c0)*94 + int(c1); i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
if r == 0 {
r = '\ufffd'
}
}
case c0 == 0x80:
r, size = 0x80, 1
default:
r, size = '\ufffd', 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type shiftJISEncoder struct{ transform.NopResetter }
func (shiftJISEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break loop
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto write2
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
r -= 0xff61 - 0xa1
goto write1
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write1:
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
j1 := uint8(r>>codeShift) & codeMask
j2 := uint8(r) & codeMask
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
}
if j1 <= 61 {
dst[nDst+0] = 129 + j1/2
} else {
dst[nDst+0] = 193 + j1/2
}
if j1&1 == 0 {
dst[nDst+1] = j2 + j2/63 + 64
} else {
dst[nDst+1] = j2 + 159
}
nDst += 2
continue
}
return nDst, nSrc, err
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,177 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package korean
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{EUCKR}
// EUCKR is the EUC-KR encoding, also known as Code Page 949.
var EUCKR encoding.Encoding = &eucKR
var eucKR = internal.Encoding{
&internal.SimpleEncoding{eucKRDecoder{}, eucKREncoder{}},
"EUC-KR",
identifier.EUCKR,
}
type eucKRDecoder struct{ transform.NopResetter }
func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0x81 <= c0 && c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
size = 2
if c0 < 0xc7 {
r = 178 * rune(c0-0x81)
switch {
case 0x41 <= c1 && c1 < 0x5b:
r += rune(c1) - (0x41 - 0*26)
case 0x61 <= c1 && c1 < 0x7b:
r += rune(c1) - (0x61 - 1*26)
case 0x81 <= c1 && c1 < 0xff:
r += rune(c1) - (0x81 - 2*26)
default:
goto decError
}
} else if 0xa1 <= c1 && c1 < 0xff {
r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1)
} else {
goto decError
}
if int(r) < len(decode) {
r = rune(decode[r])
if r != 0 {
break
}
}
decError:
r = utf8.RuneError
if c1 < utf8.RuneSelf {
size = 1
}
default:
r, size = utf8.RuneError, 1
break
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type eucKREncoder struct{ transform.NopResetter }
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 7 {
panic("bad numEncodeTables")
}
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,12 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"golang.org/x/text/encoding"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{GB18030, GBK, HZGB2312}

@ -0,0 +1,269 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
var (
// GB18030 is the GB18030 encoding.
GB18030 encoding.Encoding = &gbk18030
// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
// and is also known as Code Page 936.
GBK encoding.Encoding = &gbk
)
var gbk = internal.Encoding{
&internal.SimpleEncoding{
gbkDecoder{gb18030: false},
gbkEncoder{gb18030: false},
},
"GBK",
identifier.GBK,
}
var gbk18030 = internal.Encoding{
&internal.SimpleEncoding{
gbkDecoder{gb18030: true},
gbkEncoder{gb18030: true},
},
"GB18030",
identifier.GB18030,
}
type gbkDecoder struct {
transform.NopResetter
gb18030 bool
}
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
case c0 == 0x80:
r, size = '€', 1
case c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
c1 := src[nSrc+1]
switch {
case 0x40 <= c1 && c1 < 0x7f:
c1 -= 0x40
case 0x80 <= c1 && c1 < 0xff:
c1 -= 0x41
case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
if nSrc+3 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
// The second byte here is always ASCII, so we can set size
// to 1 in all cases.
r, size = utf8.RuneError, 1
goto write
}
c2 := src[nSrc+2]
if c2 < 0x81 || 0xff <= c2 {
r, size = utf8.RuneError, 1
goto write
}
c3 := src[nSrc+3]
if c3 < 0x30 || 0x3a <= c3 {
r, size = utf8.RuneError, 1
goto write
}
size = 4
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
if r < 39420 {
i, j := 0, len(gb18030)
for i < j {
h := i + (j-i)/2
if r >= rune(gb18030[h][0]) {
i = h + 1
} else {
j = h
}
}
dec := &gb18030[i-1]
r += rune(dec[1]) - rune(dec[0])
goto write
}
r -= 189000
if 0 <= r && r < 0x100000 {
r += 0x10000
} else {
r, size = utf8.RuneError, 1
}
goto write
default:
r, size = utf8.RuneError, 1
goto write
}
r, size = '\ufffd', 2
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
r = rune(decode[i])
if r == 0 {
r = '\ufffd'
}
}
default:
r, size = utf8.RuneError, 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type gbkEncoder struct {
transform.NopResetter
gb18030 bool
}
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, r2, size := rune(0), rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
if r == '€' {
r = 0x80
goto write1
}
if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
goto write2
}
}
if e.gb18030 {
if r < 0x10000 {
i, j := 0, len(gb18030)
for i < j {
h := i + (j-i)/2
if r >= rune(gb18030[h][1]) {
i = h + 1
} else {
j = h
}
}
dec := &gb18030[i-1]
r += rune(dec[0]) - rune(dec[1])
goto write4
} else if r < 0x110000 {
r += 189000 - 0x10000
goto write4
}
}
err = internal.ErrASCIIReplacement
break
}
write1:
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r2 >> 8)
dst[nDst+1] = uint8(r2)
nDst += 2
continue
write4:
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+3] = uint8(r%10 + 0x30)
r /= 10
dst[nDst+2] = uint8(r%126 + 0x81)
r /= 126
dst[nDst+1] = uint8(r%10 + 0x30)
r /= 10
dst[nDst+0] = uint8(r + 0x81)
nDst += 4
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 5 {
panic("bad numEncodeTables")
}
}

@ -0,0 +1,245 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// HZGB2312 is the HZ-GB2312 encoding.
var HZGB2312 encoding.Encoding = &hzGB2312
var hzGB2312 = internal.Encoding{
internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
"HZ-GB2312",
identifier.HZGB2312,
}
func hzGB2312NewDecoder() transform.Transformer {
return new(hzGB2312Decoder)
}
func hzGB2312NewEncoder() transform.Transformer {
return new(hzGB2312Encoder)
}
const (
asciiState = iota
gbState
)
type hzGB2312Decoder int
func (d *hzGB2312Decoder) Reset() {
*d = asciiState
}
func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
c0 := src[nSrc]
if c0 >= utf8.RuneSelf {
r, size = utf8.RuneError, 1
goto write
}
if c0 == '~' {
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
size = 2
switch src[nSrc+1] {
case '{':
*d = gbState
continue
case '}':
*d = asciiState
continue
case '~':
if nDst >= len(dst) {
err = transform.ErrShortDst
break loop
}
dst[nDst] = '~'
nDst++
continue
case '\n':
continue
default:
r = utf8.RuneError
goto write
}
}
if *d == asciiState {
r, size = rune(c0), 1
} else {
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
size = 2
c1 := src[nSrc+1]
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
// error
} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
r = rune(decode[i])
if r != 0 {
goto write
}
}
if c1 > utf8.RuneSelf {
// Be consistent and always treat non-ASCII as a single error.
size = 1
}
r = utf8.RuneError
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type hzGB2312Encoder int
func (d *hzGB2312Encoder) Reset() {
*d = asciiState
}
func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if r == '~' {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = '~'
dst[nDst+1] = '~'
nDst += 2
continue
} else if *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = '~'
dst[nDst+1] = '}'
nDst += 2
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst += 1
continue
}
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto writeGB
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto writeGB
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto writeGB
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto writeGB
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto writeGB
}
}
terminateInASCIIState:
// Switch back to ASCII state in case of error so that an ASCII
// replacement character can be written in the correct state.
if *e != asciiState {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = '~'
dst[nDst+1] = '}'
nDst += 2
}
err = internal.ErrASCIIReplacement
break
writeGB:
c0 := uint8(r>>8) - 0x80
c1 := uint8(r) - 0x80
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
goto terminateInASCIIState
}
if *e == asciiState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = gbState
dst[nDst+0] = '~'
dst[nDst+1] = '{'
nDst += 2
} else if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = c0
dst[nDst+1] = c1
nDst += 2
continue
}
// TODO: should one always terminate in ASCII state to make it safe to
// concatenate two HZ-GB2312-encoded strings?
return nDst, nSrc, err
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,199 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package traditionalchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{Big5}
// Big5 is the Big5 encoding, also known as Code Page 950.
var Big5 encoding.Encoding = &big5
var big5 = internal.Encoding{
&internal.SimpleEncoding{big5Decoder{}, big5Encoder{}},
"Big5",
identifier.Big5,
}
type big5Decoder struct{ transform.NopResetter }
func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size, s := rune(0), 0, ""
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0x81 <= c0 && c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
c1 := src[nSrc+1]
switch {
case 0x40 <= c1 && c1 < 0x7f:
c1 -= 0x40
case 0xa1 <= c1 && c1 < 0xff:
c1 -= 0x62
case c1 < 0x40:
r, size = utf8.RuneError, 1
goto write
default:
r, size = utf8.RuneError, 2
goto write
}
r, size = '\ufffd', 2
if i := int(c0-0x81)*157 + int(c1); i < len(decode) {
if 1133 <= i && i < 1167 {
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
switch i {
case 1133:
s = "\u00CA\u0304"
goto writeStr
case 1135:
s = "\u00CA\u030C"
goto writeStr
case 1164:
s = "\u00EA\u0304"
goto writeStr
case 1166:
s = "\u00EA\u030C"
goto writeStr
}
}
r = rune(decode[i])
if r == 0 {
r = '\ufffd'
}
}
default:
r, size = utf8.RuneError, 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
continue loop
writeStr:
if nDst+len(s) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += copy(dst[nDst:], s)
continue loop
}
return nDst, nSrc, err
}
type big5Encoder struct{ transform.NopResetter }
func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
}
if r >= utf8.RuneSelf {
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
case encode7Low <= r && r < encode7High:
if r = rune(encode7[r-encode7Low]); r != 0 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 8 {
panic("bad numEncodeTables")
}
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,82 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode
import (
"golang.org/x/text/transform"
)
// BOMOverride returns a new decoder transformer that is identical to fallback,
// except that the presence of a Byte Order Mark at the start of the input
// causes it to switch to the corresponding Unicode decoding. It will only
// consider BOMs for UTF-8, UTF-16BE, and UTF-16LE.
//
// This differs from using ExpectBOM by allowing a BOM to switch to UTF-8, not
// just UTF-16 variants, and allowing falling back to any encoding scheme.
//
// This technique is recommended by the W3C for use in HTML 5: "For
// compatibility with deployed content, the byte order mark (also known as BOM)
// is considered more authoritative than anything else."
// http://www.w3.org/TR/encoding/#specification-hooks
//
// Using BOMOverride is mostly intended for use cases where the first characters
// of a fallback encoding are known to not be a BOM, for example, for valid HTML
// and most encodings.
func BOMOverride(fallback transform.Transformer) transform.Transformer {
// TODO: possibly allow a variadic argument of unicode encodings to allow
// specifying details of which fallbacks are supported as well as
// specifying the details of the implementations. This would also allow for
// support for UTF-32, which should not be supported by default.
return &bomOverride{fallback: fallback}
}
type bomOverride struct {
fallback transform.Transformer
current transform.Transformer
}
func (d *bomOverride) Reset() {
d.current = nil
d.fallback.Reset()
}
var (
// TODO: we could use decode functions here, instead of allocating a new
// decoder on every NewDecoder as IgnoreBOM decoders can be stateless.
utf16le = UTF16(LittleEndian, IgnoreBOM)
utf16be = UTF16(BigEndian, IgnoreBOM)
)
const utf8BOM = "\ufeff"
func (d *bomOverride) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if d.current != nil {
return d.current.Transform(dst, src, atEOF)
}
if len(src) < 3 && !atEOF {
return 0, 0, transform.ErrShortSrc
}
d.current = d.fallback
bomSize := 0
if len(src) >= 2 {
if src[0] == 0xFF && src[1] == 0xFE {
d.current = utf16le.NewDecoder()
bomSize = 2
} else if src[0] == 0xFE && src[1] == 0xFF {
d.current = utf16be.NewDecoder()
bomSize = 2
} else if len(src) >= 3 &&
src[0] == utf8BOM[0] &&
src[1] == utf8BOM[1] &&
src[2] == utf8BOM[2] {
d.current = transform.Nop
bomSize = 3
}
}
if bomSize < len(src) {
nDst, nSrc, err = d.current.Transform(dst, src[bomSize:], atEOF)
}
return nDst, nSrc + bomSize, err
}

@ -0,0 +1,512 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package unicode provides Unicode encodings such as UTF-16.
package unicode // import "golang.org/x/text/encoding/unicode"
import (
"bytes"
"errors"
"unicode/utf16"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/internal/utf8internal"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
)
// TODO: I think the Transformers really should return errors on unmatched
// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
// which leaves it open, but is suggested by WhatWG. It will allow for all error
// modes as defined by WhatWG: fatal, HTML and Replacement. This would require
// the introduction of some kind of error type for conveying the erroneous code
// point.
// UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.
var UTF8 encoding.Encoding = utf8enc
// UTF8BOM is an UTF-8 encoding where the decoder strips a leading byte order
// mark while the encoder adds one.
//
// Some editors add a byte order mark as a signature to UTF-8 files. Although
// the byte order mark is not useful for detecting byte order in UTF-8, it is
// sometimes used as a convention to mark UTF-8-encoded files. This relies on
// the observation that the UTF-8 byte order mark is either an illegal or at
// least very unlikely sequence in any other character encoding.
var UTF8BOM encoding.Encoding = utf8bomEncoding{}
type utf8bomEncoding struct{}
func (utf8bomEncoding) String() string {
return "UTF-8-BOM"
}
func (utf8bomEncoding) ID() (identifier.MIB, string) {
return identifier.Unofficial, "x-utf8bom"
}
func (utf8bomEncoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{
Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()},
}
}
func (utf8bomEncoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: &utf8bomDecoder{}}
}
var utf8enc = &internal.Encoding{
&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
"UTF-8",
identifier.UTF8,
}
type utf8bomDecoder struct {
checked bool
}
func (t *utf8bomDecoder) Reset() {
t.checked = false
}
func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if !t.checked {
if !atEOF && len(src) < len(utf8BOM) {
if len(src) == 0 {
return 0, 0, nil
}
return 0, 0, transform.ErrShortSrc
}
if bytes.HasPrefix(src, []byte(utf8BOM)) {
nSrc += len(utf8BOM)
src = src[len(utf8BOM):]
}
t.checked = true
}
nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
nSrc += n
return nDst, nSrc, err
}
type utf8bomEncoder struct {
written bool
t transform.Transformer
}
func (t *utf8bomEncoder) Reset() {
t.written = false
t.t.Reset()
}
func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if !t.written {
if len(dst) < len(utf8BOM) {
return nDst, 0, transform.ErrShortDst
}
nDst = copy(dst, utf8BOM)
t.written = true
}
n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
nDst += n
return nDst, nSrc, err
}
type utf8Decoder struct{ transform.NopResetter }
func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
var pSrc int // point from which to start copy in src
var accept utf8internal.AcceptRange
// The decoder can only make the input larger, not smaller.
n := len(src)
if len(dst) < n {
err = transform.ErrShortDst
n = len(dst)
atEOF = false
}
for nSrc < n {
c := src[nSrc]
if c < utf8.RuneSelf {
nSrc++
continue
}
first := utf8internal.First[c]
size := int(first & utf8internal.SizeMask)
if first == utf8internal.FirstInvalid {
goto handleInvalid // invalid starter byte
}
accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
if nSrc+size > n {
if !atEOF {
// We may stop earlier than necessary here if the short sequence
// has invalid bytes. Not checking for this simplifies the code
// and may avoid duplicate computations in certain conditions.
if err == nil {
err = transform.ErrShortSrc
}
break
}
// Determine the maximal subpart of an ill-formed subsequence.
switch {
case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
size = 1
case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
size = 2
default:
size = 3 // As we are short, the maximum is 3.
}
goto handleInvalid
}
if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
size = 1
goto handleInvalid // invalid continuation byte
} else if size == 2 {
} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
size = 2
goto handleInvalid // invalid continuation byte
} else if size == 3 {
} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
size = 3
goto handleInvalid // invalid continuation byte
}
nSrc += size
continue
handleInvalid:
// Copy the scanned input so far.
nDst += copy(dst[nDst:], src[pSrc:nSrc])
// Append RuneError to the destination.
const runeError = "\ufffd"
if nDst+len(runeError) > len(dst) {
return nDst, nSrc, transform.ErrShortDst
}
nDst += copy(dst[nDst:], runeError)
// Skip the maximal subpart of an ill-formed subsequence according to
// the W3C standard way instead of the Go way. This Transform is
// probably the only place in the text repo where it is warranted.
nSrc += size
pSrc = nSrc
// Recompute the maximum source length.
if sz := len(dst) - nDst; sz < len(src)-nSrc {
err = transform.ErrShortDst
n = nSrc + sz
atEOF = false
}
}
return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
}
// UTF16 returns a UTF-16 Encoding for the given default endianness and byte
// order mark (BOM) policy.
//
// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
// the endianness used for decoding, and will instead be output as their
// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
// is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output.
// Instead, it overrides the default endianness e for the remainder of the
// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
// affect the endianness used, and will instead be output as their standard
// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
// with the default Endianness. For ExpectBOM, in that case, the transformation
// will return early with an ErrMissingBOM error.
//
// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
// be inserted. The UTF-8 input does not need to contain a BOM.
//
// There is no concept of a 'native' endianness. If the UTF-16 data is produced
// and consumed in a greater context that implies a certain endianness, use
// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
//
// In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
// corresponds to "Where the precise type of the data stream is known... the
// BOM should not be used" and ExpectBOM corresponds to "A particular
// protocol... may require use of the BOM".
func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
}
// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
// some configurations map to the same MIB identifier. RFC 2781 has requirements
// and recommendations. Some of the "configurations" are merely recommendations,
// so multiple configurations could match.
var mibValue = map[Endianness][numBOMValues]identifier.MIB{
BigEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF16BE,
UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781.
// TODO: acceptBOM | strictBOM would map to UTF16BE as well.
},
LittleEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF16LE,
UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
// TODO: acceptBOM | strictBOM would map to UTF16LE as well.
},
// ExpectBOM is not widely used and has no valid MIB identifier.
}
// All lists a configuration for each IANA-defined UTF-16 variant.
var All = []encoding.Encoding{
UTF8,
UTF16(BigEndian, UseBOM),
UTF16(BigEndian, IgnoreBOM),
UTF16(LittleEndian, IgnoreBOM),
}
// BOMPolicy is a UTF-16 encoding's byte order mark policy.
type BOMPolicy uint8
const (
writeBOM BOMPolicy = 0x01
acceptBOM BOMPolicy = 0x02
requireBOM BOMPolicy = 0x04
bomMask BOMPolicy = 0x07
// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
// map of an array of length 8 of a type that is also used as a key or value
// in another map). See golang.org/issue/11354.
// TODO: consider changing this value back to 8 if the use of 1.4.* has
// been minimized.
numBOMValues = 8 + 1
// IgnoreBOM means to ignore any byte order marks.
IgnoreBOM BOMPolicy = 0
// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
// UseBOM means that the UTF-16 form may start with a byte order mark, which
// will be used to override the default encoding.
UseBOM BOMPolicy = writeBOM | acceptBOM
// Common and RFC 2781-compliant interpretation for UTF-16.
// ExpectBOM means that the UTF-16 form must start with a byte order mark,
// which will be used to override the default encoding.
ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
// Used in Java as Unicode (not to be confused with Java's UTF-16) and
// ICU's UTF-16,version=1. Not compliant with RFC 2781.
// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
// (UnicodeBig and UnicodeLittle in Java)
// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
// acceptBOM | strictBOM (e.g. assigned to CheckBOM).
// This addition would be consistent with supporting ExpectBOM.
)
// Endianness is a UTF-16 encoding's default endianness.
type Endianness bool
const (
// BigEndian is UTF-16BE.
BigEndian Endianness = false
// LittleEndian is UTF-16LE.
LittleEndian Endianness = true
)
// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a
// starting byte order mark.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
type utf16Encoding struct {
config
mib identifier.MIB
}
type config struct {
endianness Endianness
bomPolicy BOMPolicy
}
func (u utf16Encoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: &utf16Decoder{
initial: u.config,
current: u.config,
}}
}
func (u utf16Encoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: &utf16Encoder{
endianness: u.endianness,
initialBOMPolicy: u.bomPolicy,
currentBOMPolicy: u.bomPolicy,
}}
}
func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
return u.mib, ""
}
func (u utf16Encoding) String() string {
e, b := "B", ""
if u.endianness == LittleEndian {
e = "L"
}
switch u.bomPolicy {
case ExpectBOM:
b = "Expect"
case UseBOM:
b = "Use"
case IgnoreBOM:
b = "Ignore"
}
return "UTF-16" + e + "E (" + b + " BOM)"
}
type utf16Decoder struct {
initial config
current config
}
func (u *utf16Decoder) Reset() {
u.current = u.initial
}
func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 {
return 0, 0, ErrMissingBOM
}
if len(src) == 0 {
return 0, 0, nil
}
if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 {
switch {
case src[0] == 0xfe && src[1] == 0xff:
u.current.endianness = BigEndian
nSrc = 2
case src[0] == 0xff && src[1] == 0xfe:
u.current.endianness = LittleEndian
nSrc = 2
default:
if u.current.bomPolicy&requireBOM != 0 {
return 0, 0, ErrMissingBOM
}
}
u.current.bomPolicy = IgnoreBOM
}
var r rune
var dSize, sSize int
for nSrc < len(src) {
if nSrc+1 < len(src) {
x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])
if u.current.endianness == LittleEndian {
x = x>>8 | x<<8
}
r, sSize = rune(x), 2
if utf16.IsSurrogate(r) {
if nSrc+3 < len(src) {
x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])
if u.current.endianness == LittleEndian {
x = x>>8 | x<<8
}
// Save for next iteration if it is not a high surrogate.
if isHighSurrogate(rune(x)) {
r, sSize = utf16.DecodeRune(r, rune(x)), 4
}
} else if !atEOF {
err = transform.ErrShortSrc
break
}
}
if dSize = utf8.RuneLen(r); dSize < 0 {
r, dSize = utf8.RuneError, 3
}
} else if atEOF {
// Single trailing byte.
r, dSize, sSize = utf8.RuneError, 3, 1
} else {
err = transform.ErrShortSrc
break
}
if nDst+dSize > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
nSrc += sSize
}
return nDst, nSrc, err
}
func isHighSurrogate(r rune) bool {
return 0xDC00 <= r && r <= 0xDFFF
}
type utf16Encoder struct {
endianness Endianness
initialBOMPolicy BOMPolicy
currentBOMPolicy BOMPolicy
}
func (u *utf16Encoder) Reset() {
u.currentBOMPolicy = u.initialBOMPolicy
}
func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if u.currentBOMPolicy&writeBOM != 0 {
if len(dst) < 2 {
return 0, 0, transform.ErrShortDst
}
dst[0], dst[1] = 0xfe, 0xff
u.currentBOMPolicy = IgnoreBOM
nDst = 2
}
r, size := rune(0), 0
for nSrc < len(src) {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
}
if r <= 0xffff {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
} else {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
r1, r2 := utf16.EncodeRune(r)
dst[nDst+0] = uint8(r1 >> 8)
dst[nDst+1] = uint8(r1)
dst[nDst+2] = uint8(r2 >> 8)
dst[nDst+3] = uint8(r2)
nDst += 4
}
nSrc += size
}
if u.endianness == LittleEndian {
for i := 0; i < nDst; i += 2 {
dst[i], dst[i+1] = dst[i+1], dst[i]
}
}
return nDst, nSrc, err
}

@ -0,0 +1,87 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf8internal contains low-level utf8-related constants, tables, etc.
// that are used internally by the text package.
package utf8internal
// The default lowest and highest continuation byte.
const (
LoCB = 0x80 // 1000 0000
HiCB = 0xBF // 1011 1111
)
// Constants related to getting information of first bytes of UTF-8 sequences.
const (
// ASCII identifies a UTF-8 byte as ASCII.
ASCII = as
// FirstInvalid indicates a byte is invalid as a first byte of a UTF-8
// sequence.
FirstInvalid = xx
// SizeMask is a mask for the size bits. Use use x&SizeMask to get the size.
SizeMask = 7
// AcceptShift is the right-shift count for the first byte info byte to get
// the index into the AcceptRanges table. See AcceptRanges.
AcceptShift = 4
// The names of these constants are chosen to give nice alignment in the
// table below. The first nibble is an index into acceptRanges or F for
// special one-byte cases. The second nibble is the Rune length or the
// Status for the special one-byte case.
xx = 0xF1 // invalid: size 1
as = 0xF0 // ASCII: size 1
s1 = 0x02 // accept 0, size 2
s2 = 0x13 // accept 1, size 3
s3 = 0x03 // accept 0, size 3
s4 = 0x23 // accept 2, size 3
s5 = 0x34 // accept 3, size 4
s6 = 0x04 // accept 0, size 4
s7 = 0x44 // accept 4, size 4
)
// First is information about the first byte in a UTF-8 sequence.
var First = [256]uint8{
// 1 2 3 4 5 6 7 8 9 A B C D E F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
// AcceptRange gives the range of valid values for the second byte in a UTF-8
// sequence for any value for First that is not ASCII or FirstInvalid.
type AcceptRange struct {
Lo uint8 // lowest value for second byte.
Hi uint8 // highest value for second byte.
}
// AcceptRanges is a slice of AcceptRange values. For a given byte sequence b
//
// AcceptRanges[First[b[0]]>>AcceptShift]
//
// will give the value of AcceptRange for the multi-byte UTF-8 sequence starting
// at b[0].
var AcceptRanges = [...]AcceptRange{
0: {LoCB, HiCB},
1: {0xA0, HiCB},
2: {LoCB, 0x9F},
3: {0x90, HiCB},
4: {LoCB, 0x8F},
}

@ -0,0 +1,187 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runes
import (
"unicode/utf8"
"golang.org/x/text/transform"
)
// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is.
// This is done for various reasons:
// - To retain the semantics of the Nop transformer: if input is passed to a Nop
// one would expect it to be unchanged.
// - It would be very expensive to pass a converted RuneError to a transformer:
// a transformer might need more source bytes after RuneError, meaning that
// the only way to pass it safely is to create a new buffer and manage the
// intermingling of RuneErrors and normal input.
// - Many transformers leave ill-formed UTF-8 as is, so this is not
// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a
// logical consequence of the operation (as for Map) or if it otherwise would
// pose security concerns (as for Remove).
// - An alternative would be to return an error on ill-formed UTF-8, but this
// would be inconsistent with other operations.
// If returns a transformer that applies tIn to consecutive runes for which
// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). Reset
// is called on tIn and tNotIn at the start of each run. A Nop transformer will
// substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is translated
// to RuneError to determine which transformer to apply, but is passed as is to
// the respective transformer.
func If(s Set, tIn, tNotIn transform.Transformer) Transformer {
if tIn == nil && tNotIn == nil {
return Transformer{transform.Nop}
}
if tIn == nil {
tIn = transform.Nop
}
if tNotIn == nil {
tNotIn = transform.Nop
}
sIn, ok := tIn.(transform.SpanningTransformer)
if !ok {
sIn = dummySpan{tIn}
}
sNotIn, ok := tNotIn.(transform.SpanningTransformer)
if !ok {
sNotIn = dummySpan{tNotIn}
}
a := &cond{
tIn: sIn,
tNotIn: sNotIn,
f: s.Contains,
}
a.Reset()
return Transformer{a}
}
type dummySpan struct{ transform.Transformer }
func (d dummySpan) Span(src []byte, atEOF bool) (n int, err error) {
return 0, transform.ErrEndOfSpan
}
type cond struct {
tIn, tNotIn transform.SpanningTransformer
f func(rune) bool
check func(rune) bool // current check to perform
t transform.SpanningTransformer // current transformer to use
}
// Reset implements transform.Transformer.
func (t *cond) Reset() {
t.check = t.is
t.t = t.tIn
t.t.Reset() // notIn will be reset on first usage.
}
func (t *cond) is(r rune) bool {
if t.f(r) {
return true
}
t.check = t.isNot
t.t = t.tNotIn
t.tNotIn.Reset()
return false
}
func (t *cond) isNot(r rune) bool {
if !t.f(r) {
return true
}
t.check = t.is
t.t = t.tIn
t.tIn.Reset()
return false
}
// This implementation of Span doesn't help all too much, but it needs to be
// there to satisfy this package's Transformer interface.
// TODO: there are certainly room for improvements, though. For example, if
// t.t == transform.Nop (which will a common occurrence) it will save a bundle
// to special-case that loop.
func (t *cond) Span(src []byte, atEOF bool) (n int, err error) {
p := 0
for n < len(src) && err == nil {
// Don't process too much at a time as the Spanner that will be
// called on this block may terminate early.
const maxChunk = 4096
max := len(src)
if v := n + maxChunk; v < max {
max = v
}
atEnd := false
size := 0
current := t.t
for ; p < max; p += size {
r := rune(src[p])
if r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
if !atEOF && !utf8.FullRune(src[p:]) {
err = transform.ErrShortSrc
break
}
}
if !t.check(r) {
// The next rune will be the start of a new run.
atEnd = true
break
}
}
n2, err2 := current.Span(src[n:p], atEnd || (atEOF && p == len(src)))
n += n2
if err2 != nil {
return n, err2
}
// At this point either err != nil or t.check will pass for the rune at p.
p = n + size
}
return n, err
}
func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
p := 0
for nSrc < len(src) && err == nil {
// Don't process too much at a time, as the work might be wasted if the
// destination buffer isn't large enough to hold the result or a
// transform returns an error early.
const maxChunk = 4096
max := len(src)
if n := nSrc + maxChunk; n < len(src) {
max = n
}
atEnd := false
size := 0
current := t.t
for ; p < max; p += size {
r := rune(src[p])
if r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
if !atEOF && !utf8.FullRune(src[p:]) {
err = transform.ErrShortSrc
break
}
}
if !t.check(r) {
// The next rune will be the start of a new run.
atEnd = true
break
}
}
nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src)))
nDst += nDst2
nSrc += nSrc2
if err2 != nil {
return nDst, nSrc, err2
}
// At this point either err != nil or t.check will pass for the rune at p.
p = nSrc + size
}
return nDst, nSrc, err
}

@ -0,0 +1,355 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package runes provide transforms for UTF-8 encoded text.
package runes // import "golang.org/x/text/runes"
import (
"unicode"
"unicode/utf8"
"golang.org/x/text/transform"
)
// A Set is a collection of runes.
type Set interface {
// Contains returns true if r is contained in the set.
Contains(r rune) bool
}
type setFunc func(rune) bool
func (s setFunc) Contains(r rune) bool {
return s(r)
}
// Note: using funcs here instead of wrapping types result in cleaner
// documentation and a smaller API.
// In creates a Set with a Contains method that returns true for all runes in
// the given RangeTable.
func In(rt *unicode.RangeTable) Set {
return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
}
// In creates a Set with a Contains method that returns true for all runes not
// in the given RangeTable.
func NotIn(rt *unicode.RangeTable) Set {
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
}
// Predicate creates a Set with a Contains method that returns f(r).
func Predicate(f func(rune) bool) Set {
return setFunc(f)
}
// Transformer implements the transform.Transformer interface.
type Transformer struct {
t transform.SpanningTransformer
}
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
return t.t.Transform(dst, src, atEOF)
}
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
return t.t.Span(b, atEOF)
}
func (t Transformer) Reset() { t.t.Reset() }
// Bytes returns a new byte slice with the result of converting b using t. It
// calls Reset on t. It returns nil if any error was found. This can only happen
// if an error-producing Transformer is passed to If.
func (t Transformer) Bytes(b []byte) []byte {
b, _, err := transform.Bytes(t, b)
if err != nil {
return nil
}
return b
}
// String returns a string with the result of converting s using t. It calls
// Reset on t. It returns the empty string if any error was found. This can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) String(s string) string {
s, _, err := transform.String(t, s)
if err != nil {
return ""
}
return s
}
// TODO:
// - Copy: copying strings and bytes in whole-rune units.
// - Validation (maybe)
// - Well-formed-ness (maybe)
const runeErrorString = string(utf8.RuneError)
// Remove returns a Transformer that removes runes r for which s.Contains(r).
// Illegal input bytes are replaced by RuneError before being passed to f.
func Remove(s Set) Transformer {
if f, ok := s.(setFunc); ok {
// This little trick cuts the running time of BenchmarkRemove for sets
// created by Predicate roughly in half.
// TODO: special-case RangeTables as well.
return Transformer{remove(f)}
}
return Transformer{remove(s.Contains)}
}
// TODO: remove transform.RemoveFunc.
type remove func(r rune) bool
func (remove) Reset() {}
// Span implements transform.Spanner.
func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
for r, size := rune(0), 0; n < len(src); {
if r = rune(src[n]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
} else {
err = transform.ErrEndOfSpan
}
break
}
if t(r) {
err = transform.ErrEndOfSpan
break
}
n += size
}
return
}
// Transform implements transform.Transformer.
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for r, size := rune(0), 0; nSrc < len(src); {
if r = rune(src[nSrc]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
// We replace illegal bytes with RuneError. Not doing so might
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
// The resulting byte sequence may subsequently contain runes
// for which t(r) is true that were passed unnoticed.
if !t(utf8.RuneError) {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
}
nSrc++
continue
}
if t(r) {
nSrc += size
continue
}
if nDst+size > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < size; i++ {
dst[nDst] = src[nSrc]
nDst++
nSrc++
}
}
return
}
// Map returns a Transformer that maps the runes in the input using the given
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
// being passed to the mapping func.
func Map(mapping func(rune) rune) Transformer {
return Transformer{mapper(mapping)}
}
type mapper func(rune) rune
func (mapper) Reset() {}
// Span implements transform.Spanner.
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
for r, size := rune(0), 0; n < len(src); n += size {
if r = rune(src[n]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
} else {
err = transform.ErrEndOfSpan
}
break
}
if t(r) != r {
err = transform.ErrEndOfSpan
break
}
}
return n, err
}
// Transform implements transform.Transformer.
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
var replacement rune
var b [utf8.UTFMax]byte
for r, size := rune(0), 0; nSrc < len(src); {
if r = rune(src[nSrc]); r < utf8.RuneSelf {
if replacement = t(r); replacement < utf8.RuneSelf {
if nDst == len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = byte(replacement)
nDst++
nSrc++
continue
}
size = 1
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
nSrc++
continue
}
} else if replacement = t(r); replacement == r {
if nDst+size > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < size; i++ {
dst[nDst] = src[nSrc]
nDst++
nSrc++
}
continue
}
n := utf8.EncodeRune(b[:], replacement)
if nDst+n > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < n; i++ {
dst[nDst] = b[i]
nDst++
}
nSrc += size
}
return
}
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
func ReplaceIllFormed() Transformer {
return Transformer{&replaceIllFormed{}}
}
type replaceIllFormed struct{ transform.NopResetter }
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
for n < len(src) {
// ASCII fast path.
if src[n] < utf8.RuneSelf {
n++
continue
}
r, size := utf8.DecodeRune(src[n:])
// Look for a valid non-ASCII rune.
if r != utf8.RuneError || size != 1 {
n += size
continue
}
// Look for short source data.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
break
}
// We have an invalid rune.
err = transform.ErrEndOfSpan
break
}
return n, err
}
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for nSrc < len(src) {
// ASCII fast path.
if r := src[nSrc]; r < utf8.RuneSelf {
if nDst == len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = r
nDst++
nSrc++
continue
}
// Look for a valid non-ASCII rune.
if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
err = transform.ErrShortDst
break
}
nDst += size
nSrc += size
continue
}
// Look for short source data.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
// We have an invalid rune.
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
nSrc++
}
return nDst, nSrc, err
}

12
vendor/modules.txt vendored

@ -1245,6 +1245,18 @@ golang.org/x/sys/windows/svc/eventlog
golang.org/x/term
# golang.org/x/text v0.3.7
## explicit; go 1.17
golang.org/x/text/encoding
golang.org/x/text/encoding/charmap
golang.org/x/text/encoding/ianaindex
golang.org/x/text/encoding/internal
golang.org/x/text/encoding/internal/identifier
golang.org/x/text/encoding/japanese
golang.org/x/text/encoding/korean
golang.org/x/text/encoding/simplifiedchinese
golang.org/x/text/encoding/traditionalchinese
golang.org/x/text/encoding/unicode
golang.org/x/text/internal/utf8internal
golang.org/x/text/runes
golang.org/x/text/secure/bidirule
golang.org/x/text/transform
golang.org/x/text/unicode/bidi

Loading…
Cancel
Save