|
| 1 | +# |
| 2 | +# Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | +# contributor license agreements. See the NOTICE file distributed with |
| 4 | +# this work for additional information regarding copyright ownership. |
| 5 | +# The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | +# (the "License"); you may not use this file except in compliance with |
| 7 | +# the License. You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | +# |
| 17 | +# Based on Default.rbbi, the default RBBI rules, based on UAX#29. |
| 18 | +# Added dashes to $MidLetter, so that words aren't broken on single dashes. |
| 19 | +# |
| 20 | + |
| 21 | +!!chain; |
| 22 | + |
| 23 | +# |
| 24 | +# Character Class Definitions: Word_Break property classes, plus the custom sets $Dash, $Digit, $Letter, $Project and $Hyphen. |
| 25 | +# |
| 26 | + |
| 27 | +$CR = [\p{Word_Break = CR}]; |
| 28 | +$LF = [\p{Word_Break = LF}]; |
| 29 | +$Newline = [\p{Word_Break = Newline}]; |
| 30 | +$Extend = [\p{Word_Break = Extend}]; |
| 31 | +$Format = [\p{Word_Break = Format}]; |
| 32 | +$Katakana = [\p{Word_Break = Katakana}]; |
| 33 | +$ALetter = [\p{Word_Break = ALetter}]; |
| 34 | +$MidNumLet = [\p{Word_Break = MidNumLet}]; |
| 35 | +# Explicit list of dash characters. Don't use [:Dash:] here - it contains lots of chars that should continue to trigger word breaks |
| 36 | +$Dash = [\N{HYPHEN-MINUS} |
| 37 | + \N{HYPHEN} |
| 38 | + \N{EN DASH} |
| 39 | + \N{MINUS SIGN} |
| 40 | + \N{SMALL HYPHEN-MINUS} |
| 41 | + \N{FULLWIDTH HYPHEN-MINUS}]; |
| 42 | +$MidLetter = [\p{Word_Break = MidLetter}$Dash]; # MidLetter plus $Dash, intended to avoid breaking on a (single) hyphen |
| 43 | +$MidNum = [\p{Word_Break = MidNum}]; |
| 44 | +$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]]; |
| 45 | +$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; |
| 46 | + |
| 47 | +$Digit = [0-9]; |
| 48 | +$Letter = [a-zA-Z]; |
| 49 | +$Project = (lucene|LUCENE|Lucene|solr|SOLR|Solr|infra|Infra|INFRA|tika|TIKA|Tika); |
| 50 | +$Hyphen = [\N{HYPHEN-MINUS}]; |
| 51 | + |
| 52 | + |
| 53 | +# Dictionary character set, for triggering language-based break engines. Currently |
| 54 | +# limited to LineBreak=Complex_Context. Note that this set only works in Unicode |
| 55 | +# 5.0 or later, as the definition of Complex_Context was corrected to include all |
| 56 | +# characters requiring dictionary break. |
| 57 | + |
| 58 | +$dictionary = [:LineBreak = Complex_Context:]; |
| 59 | +$Control = [\p{Grapheme_Cluster_Break = Control}]; |
| 60 | +$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # ALetter plus the dictionary characters that are neither Extend nor Control |
| 61 | + # (note: default ALetter does not include the dictionary characters). |
| 62 | + |
| 63 | +# |
| 64 | +# Rule 4 - Ignore Format and Extend characters, |
| 65 | +# except when they appear at the beginning of a region of text. |
| 66 | +# Each *Ex variable below is its base class followed by zero or more Extend or Format characters. |
| 67 | +$KatakanaEx = $Katakana ($Extend | $Format)*; |
| 68 | +$ALetterEx = $ALetterPlus ($Extend | $Format)*; |
| 69 | +$MidNumLetEx = $MidNumLet ($Extend | $Format)*; |
| 70 | +$MidLetterEx = $MidLetter ($Extend | $Format)*; |
| 71 | +$MidNumEx = $MidNum ($Extend | $Format)*; |
| 72 | +$NumericEx = $Numeric ($Extend | $Format)*; |
| 73 | +$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; |
| 74 | + |
| 75 | +$Hiragana = [\p{script=Hiragana}]; |
| 76 | +$Ideographic = [\p{Ideographic}]; |
| 77 | +$HiraganaEx = $Hiragana ($Extend | $Format)*; |
| 78 | +$IdeographicEx = $Ideographic ($Extend | $Format)*; |
| 79 | + |
| 80 | +## ------------------------------------------------- |
| 81 | + |
| 82 | +!!forward; |
| 83 | + |
| 84 | + |
| 85 | +# Rule 3 - CR x LF (do not break between CR and LF) |
| 86 | +# |
| 87 | +$CR $LF; |
| 88 | + |
| 89 | +# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning |
| 90 | +# of a region of text. The rule here comes into play when the start of text |
| 91 | +# begins with a group of Format chars, or with a "word" consisting of a single |
| 92 | +# char that is not in any of the listed word break categories followed by |
| 93 | +# format char(s). |
| 94 | +[^$CR $LF $Newline]? ($Extend | $Format)+; |
| 95 | + |
| 96 | +$NumericEx {100}; |
| 97 | +$ALetterEx {200}; |
| 98 | +$KatakanaEx {300}; # note: these status values override those from rule 5 |
| 99 | +$HiraganaEx {300}; # by virtue of being numerically larger. |
| 100 | +$IdeographicEx {400}; # |
| 101 | + |
| 102 | +# |
| 103 | +# rule 5 |
| 104 | +# Do not break between most letters. |
| 105 | +# |
| 106 | +$ALetterEx $ALetterEx {200}; |
| 107 | + |
| 108 | +# rule 6 and 7 (disabled: the rule below is commented out, so $MidLetterEx is not used by any forward rule in this section) |
| 109 | +#$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200}; |
| 110 | + |
| 111 | +# rule 8 - do not break within a sequence of numeric characters |
| 112 | + |
| 113 | +$NumericEx $NumericEx {100}; |
| 114 | + |
| 115 | +# rule 9 - do not break between a letter and a following numeric character |
| 116 | + |
| 117 | +$ALetterEx $NumericEx {200}; |
| 118 | + |
| 119 | +# MKM: from Rob: keep a $Project name, a hyphen-minus and the ASCII digits that follow (e.g. LUCENE-1234) together, with status 900. NOTE(review): ($Digit)* also matches zero digits, so "lucene-" alone matches - confirm this is intended. |
| 120 | +$Project $Hyphen ($Digit)* {900}; |
| 121 | + |
| 122 | +# rule 10 - do not break between a numeric character and a following letter |
| 123 | + |
| 124 | +$NumericEx $ALetterEx {200}; |
| 125 | + |
| 126 | +# rule 11 and 12 - do not break a numeric sequence at a MidNum or MidNumLet character that sits between two numerics |
| 127 | + |
| 128 | +$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100}; |
| 129 | + |
| 130 | +# rule 13 - do not break between Katakana characters |
| 131 | + |
| 132 | +$KatakanaEx $KatakanaEx {300}; |
| 133 | + |
| 134 | +# rule 13a/b - do not break before (13a) or after (13b) ExtendNumLet characters |
| 135 | + |
| 136 | +$ALetterEx $ExtendNumLetEx {200}; # (13a) |
| 137 | +$NumericEx $ExtendNumLetEx {100}; # (13a) |
| 138 | +$KatakanaEx $ExtendNumLetEx {300}; # (13a) |
| 139 | +$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a) |
| 140 | + |
| 141 | +$ExtendNumLetEx $ALetterEx {200}; # (13b) |
| 142 | +$ExtendNumLetEx $NumericEx {100}; # (13b) |
| 143 | +$ExtendNumLetEx $KatakanaEx {300}; # (13b) |
0 commit comments