dt_automate/vendor/github.com/gorilla/i18n/linebreak/linebreak.go
2025-02-19 18:30:19 +08:00

301 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2013 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package linebreak implements the Unicode line breaking algorithm.
//
// Line breaking, also known as word wrapping, is the process of breaking a
// section of text into lines such that it will fit in the available width
// of a page, window or other display area.
//
// As simple as it sounds, this is not a trivial task when support for
// multilingual texts is required. The particular algorithm used in this
// package is based on best practices defined in UAX #14:
//
// http://www.unicode.org/reports/tr14/
//
// A similar package that served as inspiration for this one is Bram Stein's
// Unicode Tokenizer (for Node.js):
//
// https://github.com/bramstein/unicode-tokenizer
package linebreak
import (
"io"
"unicode"
)
// BreakAction is the kind of line break opportunity that exists between two
// adjacent line breaking classes. Values of this type are stored in a
// PairTable and returned by Scanner.Next.
type BreakAction int
// Line breaking actions.
//
// The values are assigned with iota, so BreakDirect is the zero value of
// BreakAction.
const (
// A line break opportunity exists between two adjacent characters of the
// given line breaking classes.
BreakDirect BreakAction = iota
// A line break opportunity exists between two characters of the given
// line breaking classes only if they are separated by one or more spaces.
BreakIndirect
// Pair table entry for a pair whose second class is a combining mark.
// Scanner.Next reports it only when the mark follows a space; otherwise
// the mark is attached to its base and the break is prohibited.
BreakCombiningIndirect
// Pair table entry for an opening punctuation followed by a combining
// mark (the "OP SP* CM" case). Scanner.Next always reports it as
// BreakProhibited.
BreakCombiningProhibited
// No line break opportunity exists between two characters of the given
// line breaking classes, even if they are separated by one or more space
// characters.
BreakProhibited
// A line must break following a character that has the mandatory break
// property.
BreakMandatory
)
// Pair table shortcuts.
//
// Two-letter aliases for the BreakAction values, used to keep the rows of
// pairTable short enough to line up as a grid.
const (
di = BreakDirect // direct break
in = BreakIndirect // indirect break (only across spaces)
ci = BreakCombiningIndirect // combining mark, indirect
cp = BreakCombiningProhibited // combining mark, prohibited
pr = BreakProhibited // prohibited break
ex = BreakMandatory // explicit (mandatory) break
)
// Pair table stores line breaking actions for adjacent line breaking classes.
//
// PairTable[beforeClass][afterClass] = BreakAction
//
// The outer index is the class of the character before the potential break
// and the inner index is the class of the character after it. Use the Action
// method for lookups: it bounds-checks both indexes and falls back to
// BreakProhibited for pairs the table does not cover.
//
// Note: To determine a break it is generally not sufficient to just test the
// two adjacent characters. In any case, a custom table allows some degree of
// result tailoring.
type PairTable [][]BreakAction
// Action returns the line breaking action for the given class pair.
//
// before selects the table row (class of the character preceding the
// potential break) and after selects the column (class of the character
// following it). Any pair that falls outside the table, including a negative
// class value or a row shorter than the others, yields BreakProhibited
// instead of panicking.
func (t PairTable) Action(before, after BreakClass) BreakAction {
	row, col := int(before), int(after)
	if row < 0 || row >= len(t) {
		return BreakProhibited
	}
	if col < 0 || col >= len(t[row]) {
		return BreakProhibited
	}
	return t[row][col]
}
// pairTable is the example PairTable defined in UAX #14:
//
// http://www.unicode.org/reports/tr14/#Table2
//
// Each row is the class of the character before the potential break (named
// in the trailing comment of the row); each column is the class of the
// character after it (named in the header comment). Cells use the pair table
// shortcuts: di = direct, in = indirect, ci = combining indirect,
// cp = combining prohibited, pr = prohibited.
//
// It is the table installed by NewScanner.
var pairTable = PairTable{
// after:
// OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI // before:
{pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, pr, cp, pr, pr, pr, pr, pr, pr, pr}, // OP
{di, pr, pr, in, in, pr, pr, pr, pr, in, in, di, di, di, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // CL
{di, pr, pr, in, in, pr, pr, pr, pr, in, in, in, in, in, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // CP
{pr, pr, pr, in, in, in, pr, pr, pr, in, in, in, in, in, in, in, in, in, in, in, pr, ci, pr, in, in, in, in, in, in}, // QU
{in, pr, pr, in, in, in, pr, pr, pr, in, in, in, in, in, in, in, in, in, in, in, pr, ci, pr, in, in, in, in, in, in}, // GL
{di, pr, pr, in, in, in, pr, pr, pr, di, di, di, di, di, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // NS
{di, pr, pr, in, in, in, pr, pr, pr, di, di, di, di, di, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // EX
{di, pr, pr, in, in, in, pr, pr, pr, di, di, in, di, di, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // SY
{di, pr, pr, in, in, in, pr, pr, pr, di, di, in, in, in, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // IS
{in, pr, pr, in, in, in, pr, pr, pr, di, di, in, in, in, in, di, in, in, di, di, pr, ci, pr, in, in, in, in, in, di}, // PR
{in, pr, pr, in, in, in, pr, pr, pr, di, di, in, in, in, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // PO
{in, pr, pr, in, in, in, pr, pr, pr, in, in, in, in, in, di, in, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // NU
{in, pr, pr, in, in, in, pr, pr, pr, di, di, in, in, in, di, in, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // AL
{in, pr, pr, in, in, in, pr, pr, pr, di, di, in, in, in, di, in, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // HL
{di, pr, pr, in, in, in, pr, pr, pr, di, in, di, di, di, di, in, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // ID
{di, pr, pr, in, in, in, pr, pr, pr, di, di, di, di, di, di, in, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // IN
{di, pr, pr, in, di, in, pr, pr, pr, di, di, in, di, di, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // HY
{di, pr, pr, in, di, in, pr, pr, pr, di, di, di, di, di, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // BA
{in, pr, pr, in, in, in, pr, pr, pr, in, in, in, in, in, in, in, in, in, in, in, pr, ci, pr, in, in, in, in, in, in}, // BB
{di, pr, pr, in, in, in, pr, pr, pr, di, di, di, di, di, di, di, in, in, di, pr, pr, ci, pr, di, di, di, di, di, di}, // B2
{di, di, di, di, di, di, di, di, di, di, di, di, di, di, di, di, di, di, di, di, pr, di, di, di, di, di, di, di, di}, // ZW
{in, pr, pr, in, in, in, pr, pr, pr, di, di, in, in, in, di, in, in, in, di, di, pr, ci, pr, di, di, di, di, di, di}, // CM
{in, pr, pr, in, in, in, pr, pr, pr, in, in, in, in, in, in, in, in, in, in, in, pr, ci, pr, in, in, in, in, in, in}, // WJ
{di, pr, pr, in, in, in, pr, pr, pr, di, in, di, di, di, di, in, in, in, di, di, pr, ci, pr, di, di, di, in, in, di}, // H2
{di, pr, pr, in, in, in, pr, pr, pr, di, in, di, di, di, di, in, in, in, di, di, pr, ci, pr, di, di, di, di, in, di}, // H3
{di, pr, pr, in, in, in, pr, pr, pr, di, in, di, di, di, di, in, in, in, di, di, pr, ci, pr, in, in, in, in, di, di}, // JL
{di, pr, pr, in, in, in, pr, pr, pr, di, in, di, di, di, di, in, in, in, di, di, pr, ci, pr, di, di, di, in, in, di}, // JV
{di, pr, pr, in, in, in, pr, pr, pr, di, in, di, di, di, di, in, in, in, di, di, pr, ci, pr, di, di, di, di, in, di}, // JT
{di, pr, pr, in, in, in, pr, pr, pr, di, di, di, di, di, di, di, in, in, di, di, pr, ci, pr, di, di, di, di, di, in}, // RI
}
// ClassResolver returns a line breaking class for the given rune.
//
// A Scanner calls its Resolver once for every rune it reads, so a custom
// resolver can be assigned to Scanner.Resolver to tailor how runes are
// classified.
type ClassResolver func(rune) BreakClass
// classResolver is the default ClassResolver.
//
// It looks up the class of r and then applies rule LB1, folding the classes
// AI, CB, CJ, SA, SG and XX into classes the pair table knows about, using
// the generic resolution proposed in UAX #14.
func classResolver(r rune) BreakClass {
	switch resolved := Class(r); resolved {
	case ClassAI, ClassSG, ClassXX:
		// Ambiguous, surrogate and unknown runes behave as alphabetic.
		return ClassAL
	case ClassCJ:
		// Conditional Japanese starters are treated as non-starters.
		return ClassNS
	case ClassSA:
		// Complex-context runes: marks combine, everything else is
		// alphabetic.
		if unicode.In(r, unicode.Mn, unicode.Mc) {
			return ClassCM
		}
		return ClassAL
	case ClassCB:
		// TODO: CB should be left to be resolved later, according to
		// LB9, LB10 and LB20.
		// For now we are using a placeholder; maybe not the best one.
		return ClassID
	default:
		return resolved
	}
}
// NewScanner returns a line breaking scanner to scan the given runes.
//
// The scanner starts at the beginning of r and is configured with the
// package's default class resolver and pair table; both can be replaced
// through the exported Scanner fields before scanning.
func NewScanner(r []rune) *Scanner {
	sc := new(Scanner)
	sc.runes = r
	sc.Resolver = classResolver
	sc.Table = pairTable
	return sc
}
// Scanner scans a text looking for line breaking opportunities.
//
// Create one with NewScanner, which fills in the default Resolver and Table,
// then call Next repeatedly. Resolver and Table are exported so callers can
// substitute their own to tailor the results. The remaining fields are the
// scanner's private cursor state.
type Scanner struct {
Resolver ClassResolver // returns a line breaking class for a rune
Table PairTable // returns an action for adjacent line breaking classes
runes []rune // input
pos int // position of the input when moving forward
prevClass BreakClass // previous rune class when moving forward
err error // possible error; freezes the scanner
}
// Next finds the next line breaking action in the input.
//
// It returns the index pos of a rune in the input together with the action
// that applies immediately before that rune, i.e. between the runes at
// pos-1 and pos. The first call reports position 0, where a break is never
// allowed. A BreakIndirect or BreakCombiningIndirect result is only returned
// when the rune really follows a space, so callers may treat both as break
// opportunities. BreakMandatory is returned for the position that follows a
// hard line break (BK, NL, LF, CR LF, or a lone CR).
//
// It can be called successively to find all actions until the end of the
// input, when it returns BreakMandatory at pos == len(input) together with
// io.EOF as error.
func (s *Scanner) Next() (pos int, action BreakAction, err error) {
	var cls BreakClass

	// Start of text: the first rune only seeds prevClass.
	if s.pos == 0 {
		cls, err = s.nextClass()
		if err != nil {
			// Always break at the end (empty input).
			return 0, BreakMandatory, err
		}
		s.prevClass = cls
		return 0, BreakProhibited, nil
	}

	// Now read the next rune and decide what to do.
	// We handle line terminators and spaces manually, and anything else
	// using the pair table.
	pos = s.pos
	cls, err = s.nextClass()
	if err != nil {
		// Always break at the end.
		return pos, BreakMandatory, err
	}

	// A hard line break precedes this rune when the previous rune was a
	// BK/NL/LF, or a CR that is not immediately followed by LF (CR LF must
	// stay together). The break is mandatory and the current rune becomes
	// the first rune of a new line, so prevClass is re-seeded exactly as at
	// the start of text. Without this the scanner would stay stuck on the
	// terminator class for the rest of the input.
	if s.prevClass == ClassBK || (s.prevClass == ClassCR && cls != ClassLF) {
		s.prevClass = lineStartClass(cls)
		return pos, BreakMandatory, nil
	}

	switch cls {
	case ClassBK, ClassLF:
		// BK, NL (already folded into BK by nextClass) and LF: never
		// break before the terminator; the mandatory break is reported
		// for the following position.
		s.prevClass = ClassBK
		return pos, BreakProhibited, nil
	case ClassCR:
		// Handled like BK, but remembered separately so that a following
		// LF can be kept on the same line.
		s.prevClass = ClassCR
		return pos, BreakProhibited, nil
	case ClassSP:
		// Rule LB7: never break before a space. prevClass is deliberately
		// left untouched so it keeps naming the last non-space class.
		return pos, BreakProhibited, nil
	}

	// prevClass skips over spaces, so whether this rune directly follows a
	// space has to be read back from the input. pos >= 1 is guaranteed here
	// because the pos == 0 case returned above.
	afterSpace := s.Resolver(s.runes[pos-1]) == ClassSP

	// Lookup pair table information in PairTable[before][after].
	action = s.Table.Action(s.prevClass, cls)
	switch action {
	case BreakIndirect:
		// "A SP+ B" is a break opportunity, "A B" is not.
		if !afterSpace {
			action = BreakProhibited
		}
	case BreakCombiningIndirect:
		if !afterSpace {
			// Rule LB9 (X CM* -> X): do not break before the mark and
			// keep X as prevClass so the whole sequence acts like X.
			return pos, BreakProhibited, nil
		}
		// A space is not a base for the mark, so a break is allowed after
		// the space; the action stays BreakCombiningIndirect.
	case BreakCombiningProhibited:
		// This is the case OP SP* CM: no break allowed.
		action = BreakProhibited
		if !afterSpace {
			// Rule LB9 (X CM* -> X): keep OP as prevClass.
			return pos, action, nil
		}
	}

	// Save cls of the "before" character for the next call.
	s.prevClass = cls
	return pos, action, nil
}

// nextClass returns the next line breaking class in the input and advances
// the scanner by one rune.
//
// Once the input is exhausted it records io.EOF in s.err and keeps returning
// that error on every later call. NL is always folded into BK, and the very
// first rune of the text additionally goes through lineStartClass.
func (s *Scanner) nextClass() (cls BreakClass, err error) {
	if s.err != nil {
		return cls, s.err
	}
	if s.pos >= len(s.runes) {
		s.err = io.EOF
		return cls, s.err
	}
	sot := s.pos == 0
	cls = s.Resolver(s.runes[s.pos])
	s.pos++
	if cls == ClassNL {
		// Equivalent; simplifies code:
		// "The NL class acts like BK in all respects (there is a
		// mandatory break after any NEL character)."
		cls = ClassBK
	}
	if sot {
		cls = lineStartClass(cls)
	}
	return cls, nil
}

// lineStartClass adjusts the class of the first rune of a line: a leading
// space is treated as if it followed a word joiner (WJ), and a leading LF is
// treated as BK.
func lineStartClass(cls BreakClass) BreakClass {
	switch cls {
	case ClassSP:
		return ClassWJ
	case ClassLF:
		return ClassBK
	}
	return cls
}
// last finds the last line breaking action in the input.
//
// It is meant to be called successively to find all actions until the start
// of the input, where it should return io.EOF as error (really meaning SOF).
//
// NOTE: backward scanning is not implemented yet. The body is a stub that
// returns the zero value of every result and does not touch the scanner.
func (s *Scanner) last() (pos int, action BreakAction, err error) {
// TODO: implement backward scanning.
return
}