Package com.ibm.icu.impl.coll
Class CollationBuilder
java.lang.Object
com.ibm.icu.impl.coll.CollationRuleParser.Sink
com.ibm.icu.impl.coll.CollationBuilder
-
Nested Class Summary
Nested ClassesModifier and TypeClassDescriptionprivate static final class
private static final class
-
Field Summary
FieldsModifier and TypeFieldDescriptionprivate CollationTailoring
private CollationData
private long[]
private int
private static final UnicodeSet
private CollationDataBuilder
private static final boolean
private boolean
private Normalizer2
private static final int
Node bit 6 is set on a primary node if there are nodes with secondary values below the common secondary weight (05).private static final int
Node bit 5 is set on a primary or secondary node if there are nodes with tertiary values below the common tertiary weight (05).private static final int
Node bit 3 distinguishes a tailored node, which has no weight value, from a node with an explicit (root or default) weight.private static int
private static final int
At most 1M nodes, limited by the 20 bits in node bit fields.private Normalizer2Impl
private Normalizer2
private UVector64
Data structure for assigning tailored weights and CEs.private UnicodeSet
private CollationRootElements
private UVector32
Indexes of nodes with root primary weights, sorted by primary.private long
-
Constructor Summary
Constructors -
Method Summary
Modifier and TypeMethodDescriptionprivate int
addIfDifferent
(CharSequence prefix, CharSequence str, long[] newCEs, int newCEsLength, int ce32) private int
addOnlyClosure
(CharSequence nfdPrefix, CharSequence nfdString, long[] newCEs, int newCEsLength, int ce32) (package private) void
addRelation
(int strength, CharSequence prefix, CharSequence str, CharSequence extension) Implements CollationRuleParser.Sink.(package private) void
addReset
(int strength, CharSequence str) Implements CollationRuleParser.Sink.private void
addTailComposites
(CharSequence nfdPrefix, CharSequence nfdString) private int
addWithClosure
(CharSequence nfdPrefix, CharSequence nfdString, long[] newCEs, int newCEsLength, int ce32) Adds the mapping and its canonical closure.private static final int
alignWeightRight
(int w) private static final int
binarySearchForRootPrimaryNode
(int[] rootPrimaryIndexes, int length, long[] nodes, long p) Like Java Collections.binarySearch(List, key, Comparator).private static int
ceStrength
(long ce) private static long
changeNodeNextIndex
(long node, int next) private static long
changeNodePreviousIndex
(long node, int previous) private void
private static int
countTailoredNodes
(long[] nodesArray, int i, int strength) Counts the tailored nodes of the given strength up to the next node which is either stronger or has an explicit weight of this strength.private boolean
equalSubSequences
(CharSequence left, int leftStart, CharSequence right, int rightStart) private void
Replaces temporary CEs with the final CEs they point to.private int
findCommonNode
(int index, int strength) Finds the node which implies or contains a common=05 weight of the given strength (secondary or tertiary), if the current node is stronger.private int
findOrInsertNodeForCEs
(int strength) Picks one of the current CEs and finds or inserts a node in the graph for the CE + strength.private int
findOrInsertNodeForPrimary
(long p) Finds or inserts the node for a root CE's primary weight.private int
findOrInsertNodeForRootCE
(long ce, int strength) private int
findOrInsertWeakNode
(int index, int weight16, int level) Finds or inserts the node for a secondary or tertiary weight.private long
private int
getWeight16Before
(int index, long node, int level) Returns the secondary or tertiary weight preceding the current node's weight.private boolean
private boolean
private static int
indexFromTempCE
(long tempCE) private static int
indexFromTempCE32
(int tempCE32) private int
insertNodeBetween
(int index, int nextIndex, long node) Inserts a new node into the list, between list-adjacent items.private int
insertTailoredNodeAfter
(int index, int strength) Makes and inserts a new tailored node into the list, after the one at index.private boolean
private static boolean
isTailoredNode
(long node) private static boolean
isTempCE
(long ce) private static boolean
isTempCE32
(int ce32) private void
Walks the tailoring graph and overwrites tailored nodes with new CEs.private boolean
mergeCompositeIntoString
(CharSequence nfdString, int indexAfterLastStarter, int composite, CharSequence decomp, StringBuilder newNFDString, StringBuilder newString) private static int
nextIndexFromNode
(long node) private static long
nodeFromNextIndex
(int next) private static long
nodeFromPreviousIndex
(int previous) private static long
nodeFromStrength
(int strength) private static long
nodeFromWeight16
(int weight16) private static long
nodeFromWeight32
(long weight32) private static boolean
nodeHasAnyBefore
(long node) private static boolean
nodeHasBefore2
(long node) private static boolean
nodeHasBefore3
(long node) (package private) void
optimize
(UnicodeSet set) Implements CollationRuleParser.Sink.parseAndBuild
(String ruleString) private static int
previousIndexFromNode
(long node) private static boolean
sameCEs
(long[] ces1, int ces1Length, long[] ces2, int ces2Length) private void
setCaseBits
(CharSequence nfdString) private static int
strengthFromNode
(long node) private static int
strengthFromTempCE
(long tempCE) (package private) void
Implements CollationRuleParser.Sink.private static long
tempCEFromIndexAndStrength
(int index, int strength) Encodes "temporary CE" data into a CE that fits into the CE32 data structure, with 2-byte primary, 1-byte secondary and 6-bit tertiary, with valid CE byte values.private static int
weight16FromNode
(long node) private static long
weight32FromNode
(long node)
-
Field Details
-
DEBUG
private static final boolean DEBUG- See Also:
-
kClosureLoopLimit
private static int kClosureLoopLimit -
COMPOSITES
-
MAX_INDEX
private static final int MAX_INDEXAt most 1M nodes, limited by the 20 bits in node bit fields.- See Also:
-
HAS_BEFORE2
private static final int HAS_BEFORE2Node bit 6 is set on a primary node if there are nodes with secondary values below the common secondary weight (05).- See Also:
-
HAS_BEFORE3
private static final int HAS_BEFORE3Node bit 5 is set on a primary or secondary node if there are nodes with tertiary values below the common tertiary weight (05).- See Also:
-
IS_TAILORED
private static final int IS_TAILOREDNode bit 3 distinguishes a tailored node, which has no weight value, from a node with an explicit (root or default) weight.- See Also:
-
nfd
-
fcd
-
nfcImpl
-
base
-
baseData
-
rootElements
-
variableTop
private long variableTop -
dataBuilder
-
fastLatinEnabled
private boolean fastLatinEnabled -
optimizeSet
-
ces
private long[] ces -
cesLength
private int cesLength -
rootPrimaryIndexes
Indexes of nodes with root primary weights, sorted by primary. Compact form of a TreeMap from root primary to node index. This is a performance optimization for finding reset positions. Without this, we would have to search through the entire nodes list. It also allows storing root primary weights in list head nodes, without previous index, leaving room in root primary nodes for 32-bit primary weights. -
nodes
Data structure for assigning tailored weights and CEs. Doubly-linked lists of nodes in mostly collation order. Each list starts with a root primary node and ends with a nextIndex of 0. When there are any nodes in the list, then there is always a root primary node at index 0. This allows some code not to have to check explicitly for nextIndex==0. Root primary nodes have 32-bit weights but do not have previous indexes. All other nodes have at most 16-bit weights and do have previous indexes. Nodes with explicit weights store root collator weights, or default weak weights (e.g., secondary 05) for stronger nodes. "Tailored" nodes, with the IS_TAILORED bit set, do not store explicit weights but rather create a difference of a certain strength from the preceding node. A root node is followed by either - a root/default node of the same strength, or - a root/default node of the next-weaker strength, or - a tailored node of the same strength. A node of a given strength normally implies "common" weights on weaker levels. A node with HAS_BEFORE2 must be immediately followed by a secondary node with an explicit below-common weight, then a secondary tailored node, and later an explicit common-secondary node. The below-common weight can be a root weight, or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight or before the lowest root weight. (&[before 2] resets to an explicit secondary node so that the following addRelation(secondary) tailors right after that. If we did not have this node and instead were to reset on the primary node, then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.) If the flag is not set, then there are no explicit secondary nodes with the common or lower weights. Same for HAS_BEFORE3 for tertiary nodes and weights. A node must not have both flags set. Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs which point to stable indexes in this list, and temporary CEs stored in a CollationDataBuilder only point to tailored nodes. A temporary CE in the ces[] array may point to a non-tailored reset-before-position node, until the next relation is added. At the end, the tailored weights are allocated as necessary, then the tailored nodes are replaced with final CEs, and the CollationData is rewritten by replacing temporary CEs with final ones. We cannot simply insert new nodes in the middle of the array because that would invalidate the indexes stored in existing temporary CEs. We need to use a linked graph with stable indexes to existing nodes. A doubly-linked list seems easiest to maintain. Each node is stored as an long, with its fields stored as bit fields. Root primary node: - primary weight: 32 bits 63..32 - reserved/unused/zero: 4 bits 31..28 Weaker root nodes & tailored nodes: - a weight: 16 bits 63..48 + a root or default weight for a non-tailored node + unused/zero for a tailored node - index to the previous node: 20 bits 47..28 All types of nodes: - index to the next node: 20 bits 27..8 + nextIndex=0 in last node per root-primary list - reserved/unused/zero bits: bits 7, 4, 2 - HAS_BEFORE2: bit 6 - HAS_BEFORE3: bit 5 - IS_TAILORED: bit 3 - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0 We could allocate structs with pointers, but we would have to store them in a pointer list so that they can be indexed from temporary CEs, and they would require more memory allocations.
-
-
Constructor Details
-
CollationBuilder
-
-
Method Details
-
parseAndBuild
- Throws:
ParseException
-
addReset
Implements CollationRuleParser.Sink.- Specified by:
addReset
in classCollationRuleParser.Sink
-
getWeight16Before
private int getWeight16Before(int index, long node, int level) Returns the secondary or tertiary weight preceding the current node's weight. node=nodes[index]. -
getSpecialResetPosition
-
addRelation
Implements CollationRuleParser.Sink.- Specified by:
addRelation
in classCollationRuleParser.Sink
-
findOrInsertNodeForCEs
private int findOrInsertNodeForCEs(int strength) Picks one of the current CEs and finds or inserts a node in the graph for the CE + strength. -
findOrInsertNodeForRootCE
private int findOrInsertNodeForRootCE(long ce, int strength) -
binarySearchForRootPrimaryNode
private static final int binarySearchForRootPrimaryNode(int[] rootPrimaryIndexes, int length, long[] nodes, long p) Like Java Collections.binarySearch(List, key, Comparator).- Returns:
- the index>=0 where the item was found, or the index<0 for inserting the string at ~index in sorted order (index into rootPrimaryIndexes)
-
findOrInsertNodeForPrimary
private int findOrInsertNodeForPrimary(long p) Finds or inserts the node for a root CE's primary weight. -
findOrInsertWeakNode
private int findOrInsertWeakNode(int index, int weight16, int level) Finds or inserts the node for a secondary or tertiary weight. -
insertTailoredNodeAfter
private int insertTailoredNodeAfter(int index, int strength) Makes and inserts a new tailored node into the list, after the one at index. Skips over nodes of weaker strength to maintain collation order ("postpone insertion").- Returns:
- the new node's index
-
insertNodeBetween
private int insertNodeBetween(int index, int nextIndex, long node) Inserts a new node into the list, between list-adjacent items. The node's previous and next indexes must not be set yet.- Returns:
- the new node's index
-
findCommonNode
private int findCommonNode(int index, int strength) Finds the node which implies or contains a common=05 weight of the given strength (secondary or tertiary), if the current node is stronger. Skips weaker nodes and tailored nodes if the current node is stronger and is followed by an explicit-common-weight node. Always returns the input index if that node is no stronger than the given strength. -
setCaseBits
-
suppressContractions
Implements CollationRuleParser.Sink.- Overrides:
suppressContractions
in classCollationRuleParser.Sink
-
optimize
Implements CollationRuleParser.Sink.- Overrides:
optimize
in classCollationRuleParser.Sink
-
addWithClosure
private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString, long[] newCEs, int newCEsLength, int ce32) Adds the mapping and its canonical closure. Takes ce32=dataBuilder.encodeCEs(...) so that the data builder need not re-encode the CEs multiple times. -
addOnlyClosure
private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString, long[] newCEs, int newCEsLength, int ce32) -
addTailComposites
-
mergeCompositeIntoString
private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter, int composite, CharSequence decomp, StringBuilder newNFDString, StringBuilder newString) -
equalSubSequences
private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) -
ignorePrefix
-
ignoreString
-
isFCD
-
closeOverComposites
private void closeOverComposites() -
addIfDifferent
private int addIfDifferent(CharSequence prefix, CharSequence str, long[] newCEs, int newCEsLength, int ce32) -
sameCEs
private static boolean sameCEs(long[] ces1, int ces1Length, long[] ces2, int ces2Length) -
alignWeightRight
private static final int alignWeightRight(int w) -
makeTailoredCEs
private void makeTailoredCEs()Walks the tailoring graph and overwrites tailored nodes with new CEs. After this, the graph is destroyed. The nodes array can then be used only as a source of tailored CEs. -
countTailoredNodes
private static int countTailoredNodes(long[] nodesArray, int i, int strength) Counts the tailored nodes of the given strength up to the next node which is either stronger or has an explicit weight of this strength. -
finalizeCEs
private void finalizeCEs()Replaces temporary CEs with the final CEs they point to. -
tempCEFromIndexAndStrength
private static long tempCEFromIndexAndStrength(int index, int strength) Encodes "temporary CE" data into a CE that fits into the CE32 data structure, with 2-byte primary, 1-byte secondary and 6-bit tertiary, with valid CE byte values. The index must not exceed 20 bits (0xfffff). The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY). Temporary CEs are distinguished from real CEs by their use of secondary weights 06..45 which are otherwise reserved for compressed sort keys. The case bits are unused and available. -
indexFromTempCE
private static int indexFromTempCE(long tempCE) -
strengthFromTempCE
private static int strengthFromTempCE(long tempCE) -
isTempCE
private static boolean isTempCE(long ce) -
indexFromTempCE32
private static int indexFromTempCE32(int tempCE32) -
isTempCE32
private static boolean isTempCE32(int ce32) -
ceStrength
private static int ceStrength(long ce) -
nodeFromWeight32
private static long nodeFromWeight32(long weight32) -
nodeFromWeight16
private static long nodeFromWeight16(int weight16) -
nodeFromPreviousIndex
private static long nodeFromPreviousIndex(int previous) -
nodeFromNextIndex
private static long nodeFromNextIndex(int next) -
nodeFromStrength
private static long nodeFromStrength(int strength) -
weight32FromNode
private static long weight32FromNode(long node) -
weight16FromNode
private static int weight16FromNode(long node) -
previousIndexFromNode
private static int previousIndexFromNode(long node) -
nextIndexFromNode
private static int nextIndexFromNode(long node) -
strengthFromNode
private static int strengthFromNode(long node) -
nodeHasBefore2
private static boolean nodeHasBefore2(long node) -
nodeHasBefore3
private static boolean nodeHasBefore3(long node) -
nodeHasAnyBefore
private static boolean nodeHasAnyBefore(long node) -
isTailoredNode
private static boolean isTailoredNode(long node) -
changeNodePreviousIndex
private static long changeNodePreviousIndex(long node, int previous) -
changeNodeNextIndex
private static long changeNodeNextIndex(long node, int next)
-