1 /* Copyright 2004 The Apache Software Foundation 2 * 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 package org.apache.xmlbeans.impl.regex; 17 18 import java.text.CharacterIterator; 19 20 /** 21 * A regular expression matching engine using Non-deterministic Finite Automaton (NFA). 22 * This engine does not conform to the POSIX regular expression. 23 * 24 * <hr width="50%"> 25 * <h3>How to use</h3> 26 * 27 * <dl> 28 * <dt>A. Standard way 29 * <dd> 30 * <pre> 31 * RegularExpression re = new RegularExpression(<var>regex</var>); 32 * if (re.matches(text)) { ... } 33 * </pre> 34 * 35 * <dt>B. Capturing groups 36 * <dd> 37 * <pre> 38 * RegularExpression re = new RegularExpression(<var>regex</var>); 39 * Match match = new Match(); 40 * if (re.matches(text, match)) { 41 * ... // You can refer captured texts with methods of the <code>Match</code> class. 42 * } 43 * </pre> 44 * 45 * </dl> 46 * 47 * <h4>Case-insensitive matching</h4> 48 * <pre> 49 * RegularExpression re = new RegularExpression(<var>regex</var>, "i"); 50 * if (re.matches(text) >= 0) { ...} 51 * </pre> 52 * 53 * <h4>Options</h4> 54 * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a> 55 * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>. 
56 * This <var>options</var> parameter consists of the following characters. 57 * </p> 58 * <dl> 59 * <dt><a name="I_OPTION"><code>"i"</code></a> 60 * <dd>This option indicates case-insensitive matching. 61 * <dt><a name="M_OPTION"><code>"m"</code></a> 62 * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text. 63 * <dt><a name="S_OPTION"><code>"s"</code></a> 64 * <dd class="REGEX"><kbd>.</kbd> matches any one character. 65 * <dt><a name="U_OPTION"><code>"u"</code></a> 66 * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> as becoming to Unicode. 67 * <dt><a name="W_OPTION"><code>"w"</code></a> 68 * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of 69 * 'Unicode Regular Expression Guidelines' Revision 4. 70 * When "w" and "u" are specified at the same time, 71 * <kbd>\b \B \< \></kbd> are processed for the "w" option. 72 * <dt><a name="COMMA_OPTION"><code>","</code></a> 73 * <dd>The parser treats a comma in a character class as a range separator. 74 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option. 75 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option. 76 * 77 * <dt><a name="X_OPTION"><code>"X"</code></a> 78 * <dd class="REGEX"> 79 * By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>. 80 * The <code>matches()</code> method does not do substring matching 81 * but entire string matching. 82 * 83 * </dl> 84 * 85 * <hr width="50%"> 86 * <h3>Syntax</h3> 87 * <table border="1" bgcolor="#ddeeff"> 88 * <tr> 89 * <td> 90 * <h4>Differences from the Perl 5 regular expression</h4> 91 * <ul> 92 * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.) 93 * <li>Supports subtraction, union, and intersection operations for character classes. 
94 * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations), 95 * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>, 96 * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>, 97 * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>, 98 * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd> 99 * </ul> 100 * </td> 101 * </tr> 102 * </table> 103 * 104 * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P> 105 * <ul> 106 * <li>Character 107 * <dl> 108 * <dt class="REGEX"><kbd>.</kbd> (A period) 109 * <dd>Matches any one character except the following characters. 110 * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D), 111 * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028) 112 * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates. 113 * <dd>When <a href="#S_OPTION">the "s" option</a> is specified, 114 * it matches any character including the above four characters. 115 * 116 * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd> 117 * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A), 118 * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009) 119 * 120 * <dt class="REGEX"><kbd>\c</kbd><var>C</var> 121 * <dd>Matches a control character. 122 * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>', 123 * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'. 124 * It matches a control character of which the character code is less than 125 * the character code of the <var>C</var> by 0x0040. 126 * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A), 127 * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B). 128 * 129 * <dt class="REGEX">a non-meta character 130 * <dd>Matches the character. 131 * 132 * <dt class="REGEX"><KBD>\</KBD> + a meta character 133 * <dd>Matches the meta character. 
134 * 135 * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> 136 * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode. 137 * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and 138 * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>. 139 * 140 * <!-- 141 * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var> 142 * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode. 143 * --> 144 * 145 * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var> 146 * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode. 147 * 148 * <dt class="REGEX"><kbd>\g</kbd> 149 * <dd>Matches a grapheme. 150 * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd> 151 * 152 * <dt class="REGEX"><kbd>\X</kbd> 153 * <dd class="REGEX">Matches a combining character sequence. 154 * It is equivalent to <kbd>(?:\PM\pM*)</kbd> 155 * </dl> 156 * </li> 157 * 158 * <li>Character class 159 * <dl> 160 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>) 161 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>) 162 * <dd>Positive character class. It matches a character in ranges. 
163 * <dd><var>R<sub>n</sub></var>: 164 * <ul> 165 * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>) 166 * <p>This range matches the character. 167 * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var> 168 * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point. 169 + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>, 170 + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd> 171 * <p>... 172 * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd> 173 * <p>These expressions specifies the same ranges as the following expressions. 174 * </ul> 175 * <p class="REGEX">Enumerated ranges are merged (union operation). 176 * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd> 177 * 178 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>) 179 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>) 180 * <dd>Negative character class. It matches a character not in ranges. 181 * 182 * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd> 183 * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.) 
184 * <dd>Subtraction or union or intersection for character classes. 185 * <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>. 186 * <dd>The result of these operations is a <u>positive character class</u> 187 * even if an expression includes any negative character classes. 188 * You have to take care of this in case-insensitive matching. 189 * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>, 190 * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching. 191 * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because 192 * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>' 193 * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>. 194 * 195 * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt> 196 * <dd>Character class subtraction for the XML Schema. 197 * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>. 198 * 199 * <dt class="REGEX"><kbd>\d</kbd> 200 * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>. 201 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 202 * <span class="REGEX"><kbd>\p{Nd}</kbd></span>. 203 * 204 * <dt class="REGEX"><kbd>\D</kbd> 205 * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd> 206 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 207 * <span class="REGEX"><kbd>\P{Nd}</kbd></span>. 208 * 209 * <dt class="REGEX"><kbd>\s</kbd> 210 * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd> 211 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 212 * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>. 
213 * 214 * <dt class="REGEX"><kbd>\S</kbd> 215 * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd> 216 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 217 * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>. 218 * 219 * <dt class="REGEX"><kbd>\w</kbd> 220 * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd> 221 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 222 * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. 223 * 224 * <dt class="REGEX"><kbd>\W</kbd> 225 * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd> 226 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 227 * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. 228 * 229 * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd> 230 * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>. 
231 * The following names are available: 232 * <dl> 233 * <dt>Unicode General Categories: 234 * <dd><kbd> 235 * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp, 236 * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So, 237 * </kbd> 238 * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters) 239 * <dt>Unicode Blocks: 240 * <dd><kbd> 241 * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B, 242 * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek, 243 * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati, 244 * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian, 245 * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation, 246 * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols, 247 * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators, 248 * Miscellaneous Technical, Control Pictures, Optical Character Recognition, 249 * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes, 250 * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana, 251 * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, 252 * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs, 253 * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates, 254 * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms, 255 * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms, 256 * Small Form Variants, Arabic Presentation Forms-B, Specials, 257 * Halfwidth and Fullwidth Forms 258 * </kbd> 259 * <dt>Others: 260 * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>) 261 * <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>) 262 * <dd><kbd>UNASSIGNED</kbd> 263 * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>) 264 * </dl> 265 * 266 * <dt 
class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd> 267 * <dd>Matches one character not in the specified General Category or the specified Block. 268 * </dl> 269 * </li> 270 * 271 * <li>Selection and Quantifier 272 * <dl> 273 * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR> 274 * <dd>... 275 * 276 * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD> 277 * <dd>Matches 0 or more <var>X</var>. 278 * 279 * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD> 280 * <dd>Matches 1 or more <var>X</var>. 281 * 282 * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD> 283 * <dd>Matches 0 or 1 <var>X</var>. 284 * 285 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd> 286 * <dd>Matches <var>number</var> times. 287 * 288 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd> 289 * <dd>... 290 * 291 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd> 292 * <dd>... 293 * 294 * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd> 295 * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd> 296 * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd> 297 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd> 298 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd> 299 * <dd>Non-greedy matching. 300 * </dl> 301 * </li> 302 * 303 * <li>Grouping, Capturing, and Back-reference 304 * <dl> 305 * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD> 306 * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>". 307 * If you want it matches "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>", 308 * you have to write "<KBD>(?:foo)+</KBD>". 309 * 310 * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD> 311 * <dd>Grouping with capturing. 
312 * It makes a group and applications can know 313 * where in target text a group matched with methods of a <code>Match</code> instance 314 * after <code><a href="#matches(java.lang.String, org.apache.xmlbeans.impl.regex.Match)">matches(String,Match)</a></code>. 315 * The 0th group means whole of this regular expression. 316 * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis. 317 * 318 * <p>For instance, a regular expression is 319 * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>" 320 * and target text is 321 * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>": 322 * <ul> 323 * <li><code>Match.getCapturedText(0)</code>: 324 * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>" 325 * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>" 326 * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>" 327 * </ul> 328 * 329 * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd> 330 * <dd> 331 * 332 * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd> 333 * <dd>Independent expression group. ................ 334 * 335 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd> 336 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd> 337 * <dd>............................ 338 * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'. 339 * Note that it can not contain 'u'. 340 * 341 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd> 342 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd> 343 * <dd>...... 344 * <dd>These expressions must be at the beginning of a group. 345 * </dl> 346 * </li> 347 * 348 * <li>Anchor 349 * <dl> 350 * <dt class="REGEX"><kbd>\A</kbd> 351 * <dd>Matches the beginning of the text. 
352 * 353 * <dt class="REGEX"><kbd>\Z</kbd> 354 * <dd>Matches the end of the text, or before an EOL character at the end of the text, 355 * or CARRIAGE RETURN + LINE FEED at the end of the text. 356 * 357 * <dt class="REGEX"><kbd>\z</kbd> 358 * <dd>Matches the end of the text. 359 * 360 * <dt class="REGEX"><kbd>^</kbd> 361 * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>. 362 * <dd>When <a href="#M_OPTION">a "m" option</a> is set, 363 * it matches the beginning of the text, or after one of EOL characters ( 364 * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028), 365 * PARAGRAPH SEPARATOR (U+2029).) 366 * 367 * <dt class="REGEX"><kbd>$</kbd> 368 * <dd>Matches the end of the text, or before an EOL character at the end of the text, 369 * or CARRIAGE RETURN + LINE FEED at the end of the text. 370 * <dd>When <a href="#M_OPTION">a "m" option</a> is set, 371 * it matches the end of the text, or before an EOL character. 372 * 373 * <dt class="REGEX"><kbd>\b</kbd> 374 * <dd>Matches word boundary. 375 * (See <a href="#W_OPTION">a "w" option</a>) 376 * 377 * <dt class="REGEX"><kbd>\B</kbd> 378 * <dd>Matches non word boundary. 379 * (See <a href="#W_OPTION">a "w" option</a>) 380 * 381 * <dt class="REGEX"><kbd>\<</kbd> 382 * <dd>Matches the beginning of a word. 383 * (See <a href="#W_OPTION">a "w" option</a>) 384 * 385 * <dt class="REGEX"><kbd>\></kbd> 386 * <dd>Matches the end of a word. 387 * (See <a href="#W_OPTION">a "w" option</a>) 388 * </dl> 389 * </li> 390 * <li>Lookahead and lookbehind 391 * <dl> 392 * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd> 393 * <dd>Lookahead. 394 * 395 * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd> 396 * <dd>Negative lookahead. 397 * 398 * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd> 399 * <dd>Lookbehind. 400 * <dd>(Note for text capturing......) 
401 * 402 * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd> 403 * <dd>Negative lookbehind. 404 * </dl> 405 * </li> 406 * 407 * <li>Misc. 408 * <dl> 409 * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>, 410 * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd> 411 * <dd>...... 412 * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd> 413 * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'. 414 * You can not write comments in character classes and before quantifiers. 415 * </dl> 416 * </li> 417 * </ul> 418 * 419 * 420 * <hr width="50%"> 421 * <h3>BNF for the regular expression</h3> 422 * <pre> 423 * regex ::= ('(?' options ')')? term ('|' term)* 424 * term ::= factor+ 425 * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )? 426 * | '(?#' [^)]* ')' 427 * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}' 428 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] 429 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X' 430 * | '(?>' regex ')' | '(?' options ':' regex ')' 431 * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')' 432 * options ::= [imsw]* ('-' [imsw]+)? 433 * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' 434 * looks ::= '(?=' regex ')' | '(?!' regex ')' 435 * | '(?<=' regex ')' | '(?<!' 
regex ')' 436 * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1 437 * category-block ::= '\' [pP] category-symbol-1 438 * | ('\p{' | '\P{') (category-symbol | block-name 439 * | other-properties) '}' 440 * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S' 441 * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo' 442 * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No' 443 * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs' 444 * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po' 445 * | 'Sm' | 'Sc' | 'Sk' | 'So' 446 * block-name ::= (See above) 447 * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED' 448 * character-1 ::= (any character except meta-characters) 449 * 450 * char-class ::= '[' ranges ']' 451 * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')' 452 * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+ 453 * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block 454 * | range-char | range-char '-' range-char 455 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2 456 * code-point ::= '\x' hex-char hex-char 457 * | '\x{' hex-char+ '}' 458 * <!-- | '\u005c u' hex-char hex-char hex-char hex-char 459 * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char 460 * hex-char ::= [0-9a-fA-F] 461 * character-2 ::= (any character except \[]-,) 462 * </pre> 463 * 464 * <hr width="50%"> 465 * <h3>TODO</h3> 466 * <ul> 467 * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a> 468 * <ul> 469 * <li>2.4 Canonical Equivalents 470 * <li>Level 3 471 * </ul> 472 * <li>Parsing performance 473 * </ul> 474 * 475 * <hr width="50%"> 476 * 477 * @author TAMURA Kent <kent@trl.ibm.co.jp> 478 * @version $Id: RegularExpression.java 111285 2004-12-08 16:54:26Z cezar $ 479 */ 480 public class RegularExpression implements java.io.Serializable { 481 static final boolean DEBUG = false; 482 483 /** 484 * Compiles a token tree into an operation flow. 
485 */ 486 private synchronized void compile(Token tok) { 487 if (this.operations != null) 488 return; 489 this.numberOfClosures = 0; 490 this.operations = this.compile(tok, null, false); 491 } 492 493 /** 494 * Converts a token to an operation. 495 */ 496 private Op compile(Token tok, Op next, boolean reverse) { 497 Op ret; 498 switch (tok.type) { 499 case Token.DOT: 500 ret = Op.createDot(); 501 ret.next = next; 502 break; 503 504 case Token.CHAR: 505 ret = Op.createChar(tok.getChar()); 506 ret.next = next; 507 break; 508 509 case Token.ANCHOR: 510 ret = Op.createAnchor(tok.getChar()); 511 ret.next = next; 512 break; 513 514 case Token.RANGE: 515 case Token.NRANGE: 516 ret = Op.createRange(tok); 517 ret.next = next; 518 break; 519 520 case Token.CONCAT: 521 ret = next; 522 if (!reverse) { 523 for (int i = tok.size()-1; i >= 0; i --) { 524 ret = compile(tok.getChild(i), ret, false); 525 } 526 } else { 527 for (int i = 0; i < tok.size(); i ++) { 528 ret = compile(tok.getChild(i), ret, true); 529 } 530 } 531 break; 532 533 case Token.UNION: 534 Op.UnionOp uni = Op.createUnion(tok.size()); 535 for (int i = 0; i < tok.size(); i ++) { 536 uni.addElement(compile(tok.getChild(i), next, reverse)); 537 } 538 ret = uni; // ret.next is null. 539 break; 540 541 case Token.CLOSURE: 542 case Token.NONGREEDYCLOSURE: 543 Token child = tok.getChild(0); 544 int min = tok.getMin(); 545 int max = tok.getMax(); 546 if (min >= 0 && min == max) { // {n} 547 ret = next; 548 for (int i = 0; i < min; i ++) { 549 ret = compile(child, ret, reverse); 550 } 551 break; 552 } 553 if (min > 0 && max > 0) 554 max -= min; 555 if (max > 0) { 556 // X{2,6} -> XX(X(X(XX?)?)?)? 
557 ret = next; 558 for (int i = 0; i < max; i ++) { 559 Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE); 560 q.next = next; 561 q.setChild(compile(child, ret, reverse)); 562 ret = q; 563 } 564 } else { 565 Op.ChildOp op; 566 if (tok.type == Token.NONGREEDYCLOSURE) { 567 op = Op.createNonGreedyClosure(); 568 } else { // Token.CLOSURE 569 if (child.getMinLength() == 0) 570 op = Op.createClosure(this.numberOfClosures++); 571 else 572 op = Op.createClosure(-1); 573 } 574 op.next = next; 575 op.setChild(compile(child, op, reverse)); 576 ret = op; 577 } 578 if (min > 0) { 579 for (int i = 0; i < min; i ++) { 580 ret = compile(child, ret, reverse); 581 } 582 } 583 break; 584 585 case Token.EMPTY: 586 ret = next; 587 break; 588 589 case Token.STRING: 590 ret = Op.createString(tok.getString()); 591 ret.next = next; 592 break; 593 594 case Token.BACKREFERENCE: 595 ret = Op.createBackReference(tok.getReferenceNumber()); 596 ret.next = next; 597 break; 598 599 case Token.PAREN: 600 if (tok.getParenNumber() == 0) { 601 ret = compile(tok.getChild(0), next, reverse); 602 } else if (reverse) { 603 next = Op.createCapture(tok.getParenNumber(), next); 604 next = compile(tok.getChild(0), next, reverse); 605 ret = Op.createCapture(-tok.getParenNumber(), next); 606 } else { 607 next = Op.createCapture(-tok.getParenNumber(), next); 608 next = compile(tok.getChild(0), next, reverse); 609 ret = Op.createCapture(tok.getParenNumber(), next); 610 } 611 break; 612 613 case Token.LOOKAHEAD: 614 ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false)); 615 break; 616 case Token.NEGATIVELOOKAHEAD: 617 ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false)); 618 break; 619 case Token.LOOKBEHIND: 620 ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true)); 621 break; 622 case Token.NEGATIVELOOKBEHIND: 623 ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true)); 624 
break; 625 626 case Token.INDEPENDENT: 627 ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse)); 628 break; 629 630 case Token.MODIFIERGROUP: 631 ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse), 632 ((Token.ModifierToken)tok).getOptions(), 633 ((Token.ModifierToken)tok).getOptionsMask()); 634 break; 635 636 case Token.CONDITION: 637 Token.ConditionToken ctok = (Token.ConditionToken)tok; 638 int ref = ctok.refNumber; 639 Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse); 640 Op yes = compile(ctok.yes, next, reverse); 641 Op no = ctok.no == null ? null : compile(ctok.no, next, reverse); 642 ret = Op.createCondition(next, ref, condition, yes, no); 643 break; 644 645 default: 646 throw new RuntimeException("Unknown token type: "+tok.type); 647 } // switch (tok.type) 648 return ret; 649 } 650 651 652 //Public 653 654 /** 655 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 656 * 657 * @return true if the target is matched to this regular expression. 658 */ 659 public boolean matches(char[] target) { 660 return this.matches(target, 0, target .length , (Match)null); 661 } 662 663 /** 664 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 665 * in specified range or not. 666 * 667 * @param start Start offset of the range. 668 * @param end End offset +1 of the range. 669 * @return true if the target is matched to this regular expression. 670 */ 671 public boolean matches(char[] target, int start, int end) { 672 return this.matches(target, start, end, (Match)null); 673 } 674 675 /** 676 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 677 * 678 * @param match A Match instance for storing matching result. 679 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 
680 */ 681 public boolean matches(char[] target, Match match) { 682 return this.matches(target, 0, target .length , match); 683 } 684 685 686 /** 687 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 688 * in specified range or not. 689 * 690 * @param start Start offset of the range. 691 * @param end End offset +1 of the range. 692 * @param match A Match instance for storing matching result. 693 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 694 */ 695 public boolean matches(char[] target, int start, int end, Match match) { 696 697 synchronized (this) { 698 if (this.operations == null) 699 this.prepare(); 700 if (this.context == null) 701 this.context = new Context(); 702 } 703 Context con = null; 704 synchronized (this.context) { 705 con = this.context.inuse ? new Context() : this.context; 706 con.reset(target, start, end, this.numberOfClosures); 707 } 708 if (match != null) { 709 match.setNumberOfGroups(this.nofparen); 710 match.setSource(target); 711 } else if (this.hasBackReferences) { 712 match = new Match(); 713 match.setNumberOfGroups(this.nofparen); 714 // Need not to call setSource() because 715 // a caller can not access this match instance. 716 } 717 con.match = match; 718 719 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 720 int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options); 721 //System.err.println("DEBUG: matchEnd="+matchEnd); 722 if (matchEnd == con.limit) { 723 if (con.match != null) { 724 con.match.setBeginning(0, con.start); 725 con.match.setEnd(0, matchEnd); 726 } 727 con.inuse = false; 728 return true; 729 } 730 return false; 731 } 732 733 /* 734 * The pattern has only fixed string. 735 * The engine uses Boyer-Moore. 
736 */ 737 if (this.fixedStringOnly) { 738 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 739 int o = this.fixedStringTable.matches(target, con.start, con.limit); 740 if (o >= 0) { 741 if (con.match != null) { 742 con.match.setBeginning(0, o); 743 con.match.setEnd(0, o+this.fixedString.length()); 744 } 745 con.inuse = false; 746 return true; 747 } 748 con.inuse = false; 749 return false; 750 } 751 752 /* 753 * The pattern contains a fixed string. 754 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 755 * If not, it return with false. 756 */ 757 if (this.fixedString != null) { 758 int o = this.fixedStringTable.matches(target, con.start, con.limit); 759 if (o < 0) { 760 //System.err.println("Non-match in fixed-string search."); 761 con.inuse = false; 762 return false; 763 } 764 } 765 766 int limit = con.limit-this.minlength; 767 int matchStart; 768 int matchEnd = -1; 769 770 /* 771 * Checks whether the expression starts with ".*". 772 */ 773 if (this.operations != null 774 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 775 if (isSet(this.options, SINGLE_LINE)) { 776 matchStart = con.start; 777 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options); 778 } else { 779 boolean previousIsEOL = true; 780 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 781 int ch = target [ matchStart ] ; 782 if (isEOLChar(ch)) { 783 previousIsEOL = true; 784 } else { 785 if (previousIsEOL) { 786 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, 787 matchStart, 1, this.options))) 788 break; 789 } 790 previousIsEOL = false; 791 } 792 } 793 } 794 } 795 796 /* 797 * Optimization against the first character. 
798 */ 799 else if (this.firstChar != null) { 800 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 801 RangeToken range = this.firstChar; 802 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 803 range = this.firstChar.getCaseInsensitiveToken(); 804 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 805 int ch = target [ matchStart ] ; 806 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 807 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] ); 808 if (!range.match(ch)) continue; 809 } else { 810 if (!range.match(ch)) { 811 char ch1 = Character.toUpperCase((char)ch); 812 if (!range.match(ch1)) 813 if (!range.match(Character.toLowerCase(ch1))) 814 continue; 815 } 816 } 817 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, 818 matchStart, 1, this.options))) 819 break; 820 } 821 } else { 822 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 823 int ch = target [ matchStart ] ; 824 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 825 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] ); 826 if (!range.match(ch)) continue; 827 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, 828 matchStart, 1, this.options))) 829 break; 830 } 831 } 832 } 833 834 /* 835 * Straightforward matching. 836 */ 837 else { 838 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 839 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options))) 840 break; 841 } 842 } 843 844 if (matchEnd >= 0) { 845 if (con.match != null) { 846 con.match.setBeginning(0, matchStart); 847 con.match.setEnd(0, matchEnd); 848 } 849 con.inuse = false; 850 return true; 851 } else { 852 con.inuse = false; 853 return false; 854 } 855 } 856 857 /** 858 * @return -1 when not match; offset of the end of matched string when match. 
859 */ 860 private int matchCharArray (Context con, Op op, int offset, int dx, int opts) { 861 862 char[] target = con.charTarget; 863 864 865 while (true) { 866 if (op == null) 867 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 868 if (offset > con.limit || offset < con.start) 869 return -1; 870 switch (op.type) { 871 case Op.CHAR: 872 if (isSet(opts, IGNORE_CASE)) { 873 int ch = op.getData(); 874 if (dx > 0) { 875 if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] )) 876 return -1; 877 offset ++; 878 } else { 879 int o1 = offset-1; 880 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] )) 881 return -1; 882 offset = o1; 883 } 884 } else { 885 int ch = op.getData(); 886 if (dx > 0) { 887 if (offset >= con.limit || ch != target [ offset ] ) 888 return -1; 889 offset ++; 890 } else { 891 int o1 = offset-1; 892 if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] ) 893 return -1; 894 offset = o1; 895 } 896 } 897 op = op.next; 898 break; 899 900 case Op.DOT: 901 if (dx > 0) { 902 if (offset >= con.limit) 903 return -1; 904 int ch = target [ offset ] ; 905 if (isSet(opts, SINGLE_LINE)) { 906 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 907 offset ++; 908 } else { 909 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 910 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] ); 911 if (isEOLChar(ch)) 912 return -1; 913 } 914 offset ++; 915 } else { 916 int o1 = offset-1; 917 if (o1 >= con.limit || o1 < 0) 918 return -1; 919 int ch = target [ o1 ] ; 920 if (isSet(opts, SINGLE_LINE)) { 921 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 922 o1 --; 923 } else { 924 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 925 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch); 926 if (!isEOLChar(ch)) 927 return -1; 928 } 929 offset = o1; 930 } 931 op = op.next; 932 break; 933 934 case Op.RANGE: 935 case Op.NRANGE: 936 if (dx > 0) { 937 if (offset >= con.limit) 938 return -1; 939 int ch = target [ offset ] ; 
940 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 941 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] ); 942 RangeToken tok = op.getToken(); 943 if (isSet(opts, IGNORE_CASE)) { 944 tok = tok.getCaseInsensitiveToken(); 945 if (!tok.match(ch)) { 946 if (ch >= 0x10000) return -1; 947 char uch; 948 if (!tok.match(uch = Character.toUpperCase((char)ch)) 949 && !tok.match(Character.toLowerCase(uch))) 950 return -1; 951 } 952 } else { 953 if (!tok.match(ch)) return -1; 954 } 955 offset ++; 956 } else { 957 int o1 = offset-1; 958 if (o1 >= con.limit || o1 < 0) 959 return -1; 960 int ch = target [ o1 ] ; 961 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 962 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch); 963 RangeToken tok = op.getToken(); 964 if (isSet(opts, IGNORE_CASE)) { 965 tok = tok.getCaseInsensitiveToken(); 966 if (!tok.match(ch)) { 967 if (ch >= 0x10000) return -1; 968 char uch; 969 if (!tok.match(uch = Character.toUpperCase((char)ch)) 970 && !tok.match(Character.toLowerCase(uch))) 971 return -1; 972 } 973 } else { 974 if (!tok.match(ch)) return -1; 975 } 976 offset = o1; 977 } 978 op = op.next; 979 break; 980 981 case Op.ANCHOR: 982 boolean go = false; 983 switch (op.getData()) { 984 case '^': 985 if (isSet(opts, MULTIPLE_LINES)) { 986 if (!(offset == con.start 987 || offset > con.start && isEOLChar( target [ offset-1 ] ))) 988 return -1; 989 } else { 990 if (offset != con.start) 991 return -1; 992 } 993 break; 994 995 case '@': // Internal use only. 996 // The @ always matches line beginnings. 
997 if (!(offset == con.start 998 || offset > con.start && isEOLChar( target [ offset-1 ] ))) 999 return -1; 1000 break; 1001 1002 case '$': 1003 if (isSet(opts, MULTIPLE_LINES)) { 1004 if (!(offset == con.limit 1005 || offset < con.limit && isEOLChar( target [ offset ] ))) 1006 return -1; 1007 } else { 1008 if (!(offset == con.limit 1009 || offset+1 == con.limit && isEOLChar( target [ offset ] ) 1010 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN 1011 && target [ offset+1 ] == LINE_FEED)) 1012 return -1; 1013 } 1014 break; 1015 1016 case 'A': 1017 if (offset != con.start) return -1; 1018 break; 1019 1020 case 'Z': 1021 if (!(offset == con.limit 1022 || offset+1 == con.limit && isEOLChar( target [ offset ] ) 1023 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN 1024 && target [ offset+1 ] == LINE_FEED)) 1025 return -1; 1026 break; 1027 1028 case 'z': 1029 if (offset != con.limit) return -1; 1030 break; 1031 1032 case 'b': 1033 if (con.length == 0) return -1; 1034 { 1035 int after = getWordType(target, con.start, con.limit, offset, opts); 1036 if (after == WT_IGNORE) return -1; 1037 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 1038 if (after == before) return -1; 1039 } 1040 break; 1041 1042 case 'B': 1043 if (con.length == 0) 1044 go = true; 1045 else { 1046 int after = getWordType(target, con.start, con.limit, offset, opts); 1047 go = after == WT_IGNORE 1048 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 1049 } 1050 if (!go) return -1; 1051 break; 1052 1053 case '<': 1054 if (con.length == 0 || offset == con.limit) return -1; 1055 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 1056 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 1057 return -1; 1058 break; 1059 1060 case '>': 1061 if (con.length == 0 || offset == con.start) return -1; 1062 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 
1063 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 1064 return -1; 1065 break; 1066 } // switch anchor type 1067 op = op.next; 1068 break; 1069 1070 case Op.BACKREFERENCE: 1071 { 1072 int refno = op.getData(); 1073 if (refno <= 0 || refno >= this.nofparen) 1074 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); 1075 if (con.match.getBeginning(refno) < 0 1076 || con.match.getEnd(refno) < 0) 1077 return -1; // ******** 1078 int o2 = con.match.getBeginning(refno); 1079 int literallen = con.match.getEnd(refno)-o2; 1080 if (!isSet(opts, IGNORE_CASE)) { 1081 if (dx > 0) { 1082 if (!regionMatches(target, offset, con.limit, o2, literallen)) 1083 return -1; 1084 offset += literallen; 1085 } else { 1086 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) 1087 return -1; 1088 offset -= literallen; 1089 } 1090 } else { 1091 if (dx > 0) { 1092 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) 1093 return -1; 1094 offset += literallen; 1095 } else { 1096 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 1097 o2, literallen)) 1098 return -1; 1099 offset -= literallen; 1100 } 1101 } 1102 } 1103 op = op.next; 1104 break; 1105 case Op.STRING: 1106 { 1107 String literal = op.getString(); 1108 int literallen = literal.length(); 1109 if (!isSet(opts, IGNORE_CASE)) { 1110 if (dx > 0) { 1111 if (!regionMatches(target, offset, con.limit, literal, literallen)) 1112 return -1; 1113 offset += literallen; 1114 } else { 1115 if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) 1116 return -1; 1117 offset -= literallen; 1118 } 1119 } else { 1120 if (dx > 0) { 1121 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) 1122 return -1; 1123 offset += literallen; 1124 } else { 1125 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 1126 literal, literallen)) 1127 return -1; 1128 offset -= 
literallen; 1129 } 1130 } 1131 } 1132 op = op.next; 1133 break; 1134 1135 case Op.CLOSURE: 1136 { 1137 /* 1138 * Saves current position to avoid 1139 * zero-width repeats. 1140 */ 1141 int id = op.getData(); 1142 if (id >= 0) { 1143 int previousOffset = con.offsets[id]; 1144 if (previousOffset < 0 || previousOffset != offset) { 1145 con.offsets[id] = offset; 1146 } else { 1147 con.offsets[id] = -1; 1148 op = op.next; 1149 break; 1150 } 1151 } 1152 1153 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 1154 if (id >= 0) con.offsets[id] = -1; 1155 if (ret >= 0) return ret; 1156 op = op.next; 1157 } 1158 break; 1159 1160 case Op.QUESTION: 1161 { 1162 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 1163 if (ret >= 0) return ret; 1164 op = op.next; 1165 } 1166 break; 1167 1168 case Op.NONGREEDYCLOSURE: 1169 case Op.NONGREEDYQUESTION: 1170 { 1171 int ret = this. matchCharArray (con, op.next, offset, dx, opts); 1172 if (ret >= 0) return ret; 1173 op = op.getChild(); 1174 } 1175 break; 1176 1177 case Op.UNION: 1178 for (int i = 0; i < op.size(); i ++) { 1179 int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts); 1180 if (DEBUG) { 1181 System.err.println("UNION: "+i+", ret="+ret); 1182 } 1183 if (ret >= 0) return ret; 1184 } 1185 return -1; 1186 1187 case Op.CAPTURE: 1188 int refno = op.getData(); 1189 if (con.match != null && refno > 0) { 1190 int save = con.match.getBeginning(refno); 1191 con.match.setBeginning(refno, offset); 1192 int ret = this. matchCharArray (con, op.next, offset, dx, opts); 1193 if (ret < 0) con.match.setBeginning(refno, save); 1194 return ret; 1195 } else if (con.match != null && refno < 0) { 1196 int index = -refno; 1197 int save = con.match.getEnd(index); 1198 con.match.setEnd(index, offset); 1199 int ret = this. 
matchCharArray (con, op.next, offset, dx, opts); 1200 if (ret < 0) con.match.setEnd(index, save); 1201 return ret; 1202 } 1203 op = op.next; 1204 break; 1205 1206 case Op.LOOKAHEAD: 1207 if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1; 1208 op = op.next; 1209 break; 1210 case Op.NEGATIVELOOKAHEAD: 1211 if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1; 1212 op = op.next; 1213 break; 1214 case Op.LOOKBEHIND: 1215 if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1; 1216 op = op.next; 1217 break; 1218 case Op.NEGATIVELOOKBEHIND: 1219 if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1; 1220 op = op.next; 1221 break; 1222 1223 case Op.INDEPENDENT: 1224 { 1225 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 1226 if (ret < 0) return ret; 1227 offset = ret; 1228 op = op.next; 1229 } 1230 break; 1231 1232 case Op.MODIFIER: 1233 { 1234 int localopts = opts; 1235 localopts |= op.getData(); 1236 localopts &= ~op.getData2(); 1237 //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); 1238 int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts); 1239 if (ret < 0) return ret; 1240 offset = ret; 1241 op = op.next; 1242 } 1243 break; 1244 1245 case Op.CONDITION: 1246 { 1247 Op.ConditionOp cop = (Op.ConditionOp)op; 1248 boolean matchp = false; 1249 if (cop.refNumber > 0) { 1250 if (cop.refNumber >= this.nofparen) 1251 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); 1252 matchp = con.match.getBeginning(cop.refNumber) >= 0 1253 && con.match.getEnd(cop.refNumber) >= 0; 1254 } else { 1255 matchp = 0 <= this. 
matchCharArray (con, cop.condition, offset, dx, opts); 1256 } 1257 1258 if (matchp) { 1259 op = cop.yes; 1260 } else if (cop.no != null) { 1261 op = cop.no; 1262 } else { 1263 op = cop.next; 1264 } 1265 } 1266 break; 1267 1268 default: 1269 throw new RuntimeException("Unknown operation type: "+op.type); 1270 } // switch (op.type) 1271 } // while 1272 } 1273 1274 private static final int getPreviousWordType(char[] target, int begin, int end, 1275 int offset, int opts) { 1276 int ret = getWordType(target, begin, end, --offset, opts); 1277 while (ret == WT_IGNORE) 1278 ret = getWordType(target, begin, end, --offset, opts); 1279 return ret; 1280 } 1281 1282 private static final int getWordType(char[] target, int begin, int end, 1283 int offset, int opts) { 1284 if (offset < begin || offset >= end) return WT_OTHER; 1285 return getWordType0( target [ offset ] , opts); 1286 } 1287 1288 1289 1290 private static final boolean regionMatches(char[] target, int offset, int limit, 1291 String part, int partlen) { 1292 if (offset < 0) return false; 1293 if (limit-offset < partlen) 1294 return false; 1295 int i = 0; 1296 while (partlen-- > 0) { 1297 if ( target [ offset++ ] != part.charAt(i++)) 1298 return false; 1299 } 1300 return true; 1301 } 1302 1303 private static final boolean regionMatches(char[] target, int offset, int limit, 1304 int offset2, int partlen) { 1305 if (offset < 0) return false; 1306 if (limit-offset < partlen) 1307 return false; 1308 int i = offset2; 1309 while (partlen-- > 0) { 1310 if ( target [ offset++ ] != target [ i++ ] ) 1311 return false; 1312 } 1313 return true; 1314 } 1315 1316 /** 1317 * @see java.lang.String#regionMatches 1318 */ 1319 private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit, 1320 String part, int partlen) { 1321 if (offset < 0) return false; 1322 if (limit-offset < partlen) 1323 return false; 1324 int i = 0; 1325 while (partlen-- > 0) { 1326 char ch1 = target [ offset++ ] ; 1327 char ch2 = 
part.charAt(i++); 1328 if (ch1 == ch2) 1329 continue; 1330 char uch1 = Character.toUpperCase(ch1); 1331 char uch2 = Character.toUpperCase(ch2); 1332 if (uch1 == uch2) 1333 continue; 1334 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 1335 return false; 1336 } 1337 return true; 1338 } 1339 1340 private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit, 1341 int offset2, int partlen) { 1342 if (offset < 0) return false; 1343 if (limit-offset < partlen) 1344 return false; 1345 int i = offset2; 1346 while (partlen-- > 0) { 1347 char ch1 = target [ offset++ ] ; 1348 char ch2 = target [ i++ ] ; 1349 if (ch1 == ch2) 1350 continue; 1351 char uch1 = Character.toUpperCase(ch1); 1352 char uch2 = Character.toUpperCase(ch2); 1353 if (uch1 == uch2) 1354 continue; 1355 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 1356 return false; 1357 } 1358 return true; 1359 } 1360 1361 1362 1363 1364 /** 1365 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 1366 * 1367 * @return true if the target is matched to this regular expression. 1368 */ 1369 public boolean matches(String target) { 1370 return this.matches(target, 0, target .length() , (Match)null); 1371 } 1372 1373 /** 1374 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 1375 * in specified range or not. 1376 * 1377 * @param start Start offset of the range. 1378 * @param end End offset +1 of the range. 1379 * @return true if the target is matched to this regular expression. 1380 */ 1381 public boolean matches(String target, int start, int end) { 1382 return this.matches(target, start, end, (Match)null); 1383 } 1384 1385 /** 1386 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 1387 * 1388 * @param match A Match instance for storing matching result. 1389 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 
1390 */ 1391 public boolean matches(String target, Match match) { 1392 return this.matches(target, 0, target .length() , match); 1393 } 1394 1395 /** 1396 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 1397 * in specified range or not. 1398 * 1399 * @param start Start offset of the range. 1400 * @param end End offset +1 of the range. 1401 * @param match A Match instance for storing matching result. 1402 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 1403 */ 1404 public boolean matches(String target, int start, int end, Match match) { 1405 1406 synchronized (this) { 1407 if (this.operations == null) 1408 this.prepare(); 1409 if (this.context == null) 1410 this.context = new Context(); 1411 } 1412 Context con = null; 1413 synchronized (this.context) { 1414 con = this.context.inuse ? new Context() : this.context; 1415 con.reset(target, start, end, this.numberOfClosures); 1416 } 1417 if (match != null) { 1418 match.setNumberOfGroups(this.nofparen); 1419 match.setSource(target); 1420 } else if (this.hasBackReferences) { 1421 match = new Match(); 1422 match.setNumberOfGroups(this.nofparen); 1423 // Need not to call setSource() because 1424 // a caller can not access this match instance. 1425 } 1426 con.match = match; 1427 1428 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 1429 if (DEBUG) { 1430 System.err.println("target string="+target); 1431 } 1432 int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options); 1433 if (DEBUG) { 1434 System.err.println("matchEnd="+matchEnd); 1435 System.err.println("con.limit="+con.limit); 1436 } 1437 if (matchEnd == con.limit) { 1438 if (con.match != null) { 1439 con.match.setBeginning(0, con.start); 1440 con.match.setEnd(0, matchEnd); 1441 } 1442 con.inuse = false; 1443 return true; 1444 } 1445 return false; 1446 } 1447 1448 /* 1449 * The pattern has only fixed string. 1450 * The engine uses Boyer-Moore. 
1451 */ 1452 if (this.fixedStringOnly) { 1453 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 1454 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1455 if (o >= 0) { 1456 if (con.match != null) { 1457 con.match.setBeginning(0, o); 1458 con.match.setEnd(0, o+this.fixedString.length()); 1459 } 1460 con.inuse = false; 1461 return true; 1462 } 1463 con.inuse = false; 1464 return false; 1465 } 1466 1467 /* 1468 * The pattern contains a fixed string. 1469 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 1470 * If not, it return with false. 1471 */ 1472 if (this.fixedString != null) { 1473 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1474 if (o < 0) { 1475 //System.err.println("Non-match in fixed-string search."); 1476 con.inuse = false; 1477 return false; 1478 } 1479 } 1480 1481 int limit = con.limit-this.minlength; 1482 int matchStart; 1483 int matchEnd = -1; 1484 1485 /* 1486 * Checks whether the expression starts with ".*". 1487 */ 1488 if (this.operations != null 1489 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 1490 if (isSet(this.options, SINGLE_LINE)) { 1491 matchStart = con.start; 1492 matchEnd = this. matchString (con, this.operations, con.start, 1, this.options); 1493 } else { 1494 boolean previousIsEOL = true; 1495 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1496 int ch = target .charAt( matchStart ) ; 1497 if (isEOLChar(ch)) { 1498 previousIsEOL = true; 1499 } else { 1500 if (previousIsEOL) { 1501 if (0 <= (matchEnd = this. matchString (con, this.operations, 1502 matchStart, 1, this.options))) 1503 break; 1504 } 1505 previousIsEOL = false; 1506 } 1507 } 1508 } 1509 } 1510 1511 /* 1512 * Optimization against the first character. 
1513 */ 1514 else if (this.firstChar != null) { 1515 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 1516 RangeToken range = this.firstChar; 1517 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 1518 range = this.firstChar.getCaseInsensitiveToken(); 1519 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1520 int ch = target .charAt( matchStart ) ; 1521 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 1522 ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) ); 1523 if (!range.match(ch)) continue; 1524 } else { 1525 if (!range.match(ch)) { 1526 char ch1 = Character.toUpperCase((char)ch); 1527 if (!range.match(ch1)) 1528 if (!range.match(Character.toLowerCase(ch1))) 1529 continue; 1530 } 1531 } 1532 if (0 <= (matchEnd = this. matchString (con, this.operations, 1533 matchStart, 1, this.options))) 1534 break; 1535 } 1536 } else { 1537 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1538 int ch = target .charAt( matchStart ) ; 1539 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 1540 ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) ); 1541 if (!range.match(ch)) continue; 1542 if (0 <= (matchEnd = this. matchString (con, this.operations, 1543 matchStart, 1, this.options))) 1544 break; 1545 } 1546 } 1547 } 1548 1549 /* 1550 * Straightforward matching. 1551 */ 1552 else { 1553 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1554 if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options))) 1555 break; 1556 } 1557 } 1558 1559 if (matchEnd >= 0) { 1560 if (con.match != null) { 1561 con.match.setBeginning(0, matchStart); 1562 con.match.setEnd(0, matchEnd); 1563 } 1564 con.inuse = false; 1565 return true; 1566 } else { 1567 con.inuse = false; 1568 return false; 1569 } 1570 } 1571 1572 /** 1573 * @return -1 when not match; offset of the end of matched string when match. 
1574 */ 1575 private int matchString (Context con, Op op, int offset, int dx, int opts) { 1576 1577 1578 1579 1580 String target = con.strTarget; 1581 1582 1583 1584 1585 while (true) { 1586 if (op == null) 1587 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 1588 if (offset > con.limit || offset < con.start) 1589 return -1; 1590 switch (op.type) { 1591 case Op.CHAR: 1592 if (isSet(opts, IGNORE_CASE)) { 1593 int ch = op.getData(); 1594 if (dx > 0) { 1595 if (offset >= con.limit || !matchIgnoreCase(ch, target .charAt( offset ) )) 1596 return -1; 1597 offset ++; 1598 } else { 1599 int o1 = offset-1; 1600 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .charAt( o1 ) )) 1601 return -1; 1602 offset = o1; 1603 } 1604 } else { 1605 int ch = op.getData(); 1606 if (dx > 0) { 1607 if (offset >= con.limit || ch != target .charAt( offset ) ) 1608 return -1; 1609 offset ++; 1610 } else { 1611 int o1 = offset-1; 1612 if (o1 >= con.limit || o1 < 0 || ch != target .charAt( o1 ) ) 1613 return -1; 1614 offset = o1; 1615 } 1616 } 1617 op = op.next; 1618 break; 1619 1620 case Op.DOT: 1621 if (dx > 0) { 1622 if (offset >= con.limit) 1623 return -1; 1624 int ch = target .charAt( offset ) ; 1625 if (isSet(opts, SINGLE_LINE)) { 1626 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1627 offset ++; 1628 } else { 1629 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1630 ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) ); 1631 if (isEOLChar(ch)) 1632 return -1; 1633 } 1634 offset ++; 1635 } else { 1636 int o1 = offset-1; 1637 if (o1 >= con.limit || o1 < 0) 1638 return -1; 1639 int ch = target .charAt( o1 ) ; 1640 if (isSet(opts, SINGLE_LINE)) { 1641 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1642 o1 --; 1643 } else { 1644 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1645 ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch); 1646 if (!isEOLChar(ch)) 1647 return -1; 1648 } 1649 offset = o1; 1650 } 1651 op = op.next; 
1652 break; 1653 1654 case Op.RANGE: 1655 case Op.NRANGE: 1656 if (dx > 0) { 1657 if (offset >= con.limit) 1658 return -1; 1659 int ch = target .charAt( offset ) ; 1660 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1661 ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) ); 1662 RangeToken tok = op.getToken(); 1663 if (isSet(opts, IGNORE_CASE)) { 1664 tok = tok.getCaseInsensitiveToken(); 1665 if (!tok.match(ch)) { 1666 if (ch >= 0x10000) return -1; 1667 char uch; 1668 if (!tok.match(uch = Character.toUpperCase((char)ch)) 1669 && !tok.match(Character.toLowerCase(uch))) 1670 return -1; 1671 } 1672 } else { 1673 if (!tok.match(ch)) return -1; 1674 } 1675 offset ++; 1676 } else { 1677 int o1 = offset-1; 1678 if (o1 >= con.limit || o1 < 0) 1679 return -1; 1680 int ch = target .charAt( o1 ) ; 1681 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1682 ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch); 1683 RangeToken tok = op.getToken(); 1684 if (isSet(opts, IGNORE_CASE)) { 1685 tok = tok.getCaseInsensitiveToken(); 1686 if (!tok.match(ch)) { 1687 if (ch >= 0x10000) return -1; 1688 char uch; 1689 if (!tok.match(uch = Character.toUpperCase((char)ch)) 1690 && !tok.match(Character.toLowerCase(uch))) 1691 return -1; 1692 } 1693 } else { 1694 if (!tok.match(ch)) return -1; 1695 } 1696 offset = o1; 1697 } 1698 op = op.next; 1699 break; 1700 1701 case Op.ANCHOR: 1702 boolean go = false; 1703 switch (op.getData()) { 1704 case '^': 1705 if (isSet(opts, MULTIPLE_LINES)) { 1706 if (!(offset == con.start 1707 || offset > con.start && isEOLChar( target .charAt( offset-1 ) ))) 1708 return -1; 1709 } else { 1710 if (offset != con.start) 1711 return -1; 1712 } 1713 break; 1714 1715 case '@': // Internal use only. 1716 // The @ always matches line beginnings. 
1717 if (!(offset == con.start 1718 || offset > con.start && isEOLChar( target .charAt( offset-1 ) ))) 1719 return -1; 1720 break; 1721 1722 case '$': 1723 if (isSet(opts, MULTIPLE_LINES)) { 1724 if (!(offset == con.limit 1725 || offset < con.limit && isEOLChar( target .charAt( offset ) ))) 1726 return -1; 1727 } else { 1728 if (!(offset == con.limit 1729 || offset+1 == con.limit && isEOLChar( target .charAt( offset ) ) 1730 || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN 1731 && target .charAt( offset+1 ) == LINE_FEED)) 1732 return -1; 1733 } 1734 break; 1735 1736 case 'A': 1737 if (offset != con.start) return -1; 1738 break; 1739 1740 case 'Z': 1741 if (!(offset == con.limit 1742 || offset+1 == con.limit && isEOLChar( target .charAt( offset ) ) 1743 || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN 1744 && target .charAt( offset+1 ) == LINE_FEED)) 1745 return -1; 1746 break; 1747 1748 case 'z': 1749 if (offset != con.limit) return -1; 1750 break; 1751 1752 case 'b': 1753 if (con.length == 0) return -1; 1754 { 1755 int after = getWordType(target, con.start, con.limit, offset, opts); 1756 if (after == WT_IGNORE) return -1; 1757 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 1758 if (after == before) return -1; 1759 } 1760 break; 1761 1762 case 'B': 1763 if (con.length == 0) 1764 go = true; 1765 else { 1766 int after = getWordType(target, con.start, con.limit, offset, opts); 1767 go = after == WT_IGNORE 1768 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 1769 } 1770 if (!go) return -1; 1771 break; 1772 1773 case '<': 1774 if (con.length == 0 || offset == con.limit) return -1; 1775 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 1776 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 1777 return -1; 1778 break; 1779 1780 case '>': 1781 if (con.length == 0 || offset == con.start) return -1; 1782 if 
(getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 1783 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 1784 return -1; 1785 break; 1786 } // switch anchor type 1787 op = op.next; 1788 break; 1789 1790 case Op.BACKREFERENCE: 1791 { 1792 int refno = op.getData(); 1793 if (refno <= 0 || refno >= this.nofparen) 1794 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); 1795 if (con.match.getBeginning(refno) < 0 1796 || con.match.getEnd(refno) < 0) 1797 return -1; // ******** 1798 int o2 = con.match.getBeginning(refno); 1799 int literallen = con.match.getEnd(refno)-o2; 1800 if (!isSet(opts, IGNORE_CASE)) { 1801 if (dx > 0) { 1802 if (!regionMatches(target, offset, con.limit, o2, literallen)) 1803 return -1; 1804 offset += literallen; 1805 } else { 1806 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) 1807 return -1; 1808 offset -= literallen; 1809 } 1810 } else { 1811 if (dx > 0) { 1812 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) 1813 return -1; 1814 offset += literallen; 1815 } else { 1816 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 1817 o2, literallen)) 1818 return -1; 1819 offset -= literallen; 1820 } 1821 } 1822 } 1823 op = op.next; 1824 break; 1825 case Op.STRING: 1826 { 1827 String literal = op.getString(); 1828 int literallen = literal.length(); 1829 if (!isSet(opts, IGNORE_CASE)) { 1830 if (dx > 0) { 1831 if (!regionMatches(target, offset, con.limit, literal, literallen)) 1832 return -1; 1833 offset += literallen; 1834 } else { 1835 if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) 1836 return -1; 1837 offset -= literallen; 1838 } 1839 } else { 1840 if (dx > 0) { 1841 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) 1842 return -1; 1843 offset += literallen; 1844 } else { 1845 if (!regionMatchesIgnoreCase(target, offset-literallen, 
con.limit, 1846 literal, literallen)) 1847 return -1; 1848 offset -= literallen; 1849 } 1850 } 1851 } 1852 op = op.next; 1853 break; 1854 1855 case Op.CLOSURE: 1856 { 1857 /* 1858 * Saves current position to avoid 1859 * zero-width repeats. 1860 */ 1861 int id = op.getData(); 1862 if (id >= 0) { 1863 int previousOffset = con.offsets[id]; 1864 if (previousOffset < 0 || previousOffset != offset) { 1865 con.offsets[id] = offset; 1866 } else { 1867 con.offsets[id] = -1; 1868 op = op.next; 1869 break; 1870 } 1871 } 1872 int ret = this. matchString (con, op.getChild(), offset, dx, opts); 1873 if (id >= 0) con.offsets[id] = -1; 1874 if (ret >= 0) return ret; 1875 op = op.next; 1876 } 1877 break; 1878 1879 case Op.QUESTION: 1880 { 1881 int ret = this. matchString (con, op.getChild(), offset, dx, opts); 1882 if (ret >= 0) return ret; 1883 op = op.next; 1884 } 1885 break; 1886 1887 case Op.NONGREEDYCLOSURE: 1888 case Op.NONGREEDYQUESTION: 1889 { 1890 int ret = this. matchString (con, op.next, offset, dx, opts); 1891 if (ret >= 0) return ret; 1892 op = op.getChild(); 1893 } 1894 break; 1895 1896 case Op.UNION: 1897 for (int i = 0; i < op.size(); i ++) { 1898 int ret = this. matchString (con, op.elementAt(i), offset, dx, opts); 1899 if (DEBUG) { 1900 System.err.println("UNION: "+i+", ret="+ret); 1901 } 1902 if (ret >= 0) return ret; 1903 } 1904 return -1; 1905 1906 case Op.CAPTURE: 1907 int refno = op.getData(); 1908 if (con.match != null && refno > 0) { 1909 int save = con.match.getBeginning(refno); 1910 con.match.setBeginning(refno, offset); 1911 int ret = this. matchString (con, op.next, offset, dx, opts); 1912 if (ret < 0) con.match.setBeginning(refno, save); 1913 return ret; 1914 } else if (con.match != null && refno < 0) { 1915 int index = -refno; 1916 int save = con.match.getEnd(index); 1917 con.match.setEnd(index, offset); 1918 int ret = this. 
matchString (con, op.next, offset, dx, opts); 1919 if (ret < 0) con.match.setEnd(index, save); 1920 return ret; 1921 } 1922 op = op.next; 1923 break; 1924 1925 case Op.LOOKAHEAD: 1926 if (0 > this. matchString (con, op.getChild(), offset, 1, opts)) return -1; 1927 op = op.next; 1928 break; 1929 case Op.NEGATIVELOOKAHEAD: 1930 if (0 <= this. matchString (con, op.getChild(), offset, 1, opts)) return -1; 1931 op = op.next; 1932 break; 1933 case Op.LOOKBEHIND: 1934 if (0 > this. matchString (con, op.getChild(), offset, -1, opts)) return -1; 1935 op = op.next; 1936 break; 1937 case Op.NEGATIVELOOKBEHIND: 1938 if (0 <= this. matchString (con, op.getChild(), offset, -1, opts)) return -1; 1939 op = op.next; 1940 break; 1941 1942 case Op.INDEPENDENT: 1943 { 1944 int ret = this. matchString (con, op.getChild(), offset, dx, opts); 1945 if (ret < 0) return ret; 1946 offset = ret; 1947 op = op.next; 1948 } 1949 break; 1950 1951 case Op.MODIFIER: 1952 { 1953 int localopts = opts; 1954 localopts |= op.getData(); 1955 localopts &= ~op.getData2(); 1956 //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); 1957 int ret = this. matchString (con, op.getChild(), offset, dx, localopts); 1958 if (ret < 0) return ret; 1959 offset = ret; 1960 op = op.next; 1961 } 1962 break; 1963 1964 case Op.CONDITION: 1965 { 1966 Op.ConditionOp cop = (Op.ConditionOp)op; 1967 boolean matchp = false; 1968 if (cop.refNumber > 0) { 1969 if (cop.refNumber >= this.nofparen) 1970 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); 1971 matchp = con.match.getBeginning(cop.refNumber) >= 0 1972 && con.match.getEnd(cop.refNumber) >= 0; 1973 } else { 1974 matchp = 0 <= this. 
matchString (con, cop.condition, offset, dx, opts); 1975 } 1976 1977 if (matchp) { 1978 op = cop.yes; 1979 } else if (cop.no != null) { 1980 op = cop.no; 1981 } else { 1982 op = cop.next; 1983 } 1984 } 1985 break; 1986 1987 default: 1988 throw new RuntimeException("Unknown operation type: "+op.type); 1989 } // switch (op.type) 1990 } // while 1991 } 1992 1993 private static final int getPreviousWordType(String target, int begin, int end, 1994 int offset, int opts) { 1995 int ret = getWordType(target, begin, end, --offset, opts); 1996 while (ret == WT_IGNORE) 1997 ret = getWordType(target, begin, end, --offset, opts); 1998 return ret; 1999 } 2000 2001 private static final int getWordType(String target, int begin, int end, 2002 int offset, int opts) { 2003 if (offset < begin || offset >= end) return WT_OTHER; 2004 return getWordType0( target .charAt( offset ) , opts); 2005 } 2006 2007 2008 private static final boolean regionMatches(String text, int offset, int limit, 2009 String part, int partlen) { 2010 if (limit-offset < partlen) return false; 2011 return text.regionMatches(offset, part, 0, partlen); 2012 } 2013 2014 private static final boolean regionMatches(String text, int offset, int limit, 2015 int offset2, int partlen) { 2016 if (limit-offset < partlen) return false; 2017 return text.regionMatches(offset, text, offset2, partlen); 2018 } 2019 2020 private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit, 2021 String part, int partlen) { 2022 return text.regionMatches(true, offset, part, 0, partlen); 2023 } 2024 2025 private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit, 2026 int offset2, int partlen) { 2027 if (limit-offset < partlen) return false; 2028 return text.regionMatches(true, offset, text, offset2, partlen); 2029 } 2030 2031 2032 2033 2034 2035 2036 2037 /** 2038 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 
2039 * 2040 * @return true if the target is matched to this regular expression. 2041 */ 2042 public boolean matches(CharacterIterator target) { 2043 return this.matches(target, (Match)null); 2044 } 2045 2046 2047 /** 2048 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 2049 * 2050 * @param match A Match instance for storing matching result. 2051 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 2052 */ 2053 public boolean matches(CharacterIterator target, Match match) { 2054 int start = target.getBeginIndex(); 2055 int end = target.getEndIndex(); 2056 2057 2058 2059 synchronized (this) { 2060 if (this.operations == null) 2061 this.prepare(); 2062 if (this.context == null) 2063 this.context = new Context(); 2064 } 2065 Context con = null; 2066 synchronized (this.context) { 2067 con = this.context.inuse ? new Context() : this.context; 2068 con.reset(target, start, end, this.numberOfClosures); 2069 } 2070 if (match != null) { 2071 match.setNumberOfGroups(this.nofparen); 2072 match.setSource(target); 2073 } else if (this.hasBackReferences) { 2074 match = new Match(); 2075 match.setNumberOfGroups(this.nofparen); 2076 // Need not to call setSource() because 2077 // a caller can not access this match instance. 2078 } 2079 con.match = match; 2080 2081 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 2082 int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options); 2083 //System.err.println("DEBUG: matchEnd="+matchEnd); 2084 if (matchEnd == con.limit) { 2085 if (con.match != null) { 2086 con.match.setBeginning(0, con.start); 2087 con.match.setEnd(0, matchEnd); 2088 } 2089 con.inuse = false; 2090 return true; 2091 } 2092 return false; 2093 } 2094 2095 /* 2096 * The pattern has only fixed string. 2097 * The engine uses Boyer-Moore. 
2098 */ 2099 if (this.fixedStringOnly) { 2100 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 2101 int o = this.fixedStringTable.matches(target, con.start, con.limit); 2102 if (o >= 0) { 2103 if (con.match != null) { 2104 con.match.setBeginning(0, o); 2105 con.match.setEnd(0, o+this.fixedString.length()); 2106 } 2107 con.inuse = false; 2108 return true; 2109 } 2110 con.inuse = false; 2111 return false; 2112 } 2113 2114 /* 2115 * The pattern contains a fixed string. 2116 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 2117 * If not, it return with false. 2118 */ 2119 if (this.fixedString != null) { 2120 int o = this.fixedStringTable.matches(target, con.start, con.limit); 2121 if (o < 0) { 2122 //System.err.println("Non-match in fixed-string search."); 2123 con.inuse = false; 2124 return false; 2125 } 2126 } 2127 2128 int limit = con.limit-this.minlength; 2129 int matchStart; 2130 int matchEnd = -1; 2131 2132 /* 2133 * Checks whether the expression starts with ".*". 2134 */ 2135 if (this.operations != null 2136 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 2137 if (isSet(this.options, SINGLE_LINE)) { 2138 matchStart = con.start; 2139 matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options); 2140 } else { 2141 boolean previousIsEOL = true; 2142 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2143 int ch = target .setIndex( matchStart ) ; 2144 if (isEOLChar(ch)) { 2145 previousIsEOL = true; 2146 } else { 2147 if (previousIsEOL) { 2148 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2149 matchStart, 1, this.options))) 2150 break; 2151 } 2152 previousIsEOL = false; 2153 } 2154 } 2155 } 2156 } 2157 2158 /* 2159 * Optimization against the first character. 
2160 */ 2161 else if (this.firstChar != null) { 2162 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 2163 RangeToken range = this.firstChar; 2164 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 2165 range = this.firstChar.getCaseInsensitiveToken(); 2166 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2167 int ch = target .setIndex( matchStart ) ; 2168 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 2169 ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) ); 2170 if (!range.match(ch)) continue; 2171 } else { 2172 if (!range.match(ch)) { 2173 char ch1 = Character.toUpperCase((char)ch); 2174 if (!range.match(ch1)) 2175 if (!range.match(Character.toLowerCase(ch1))) 2176 continue; 2177 } 2178 } 2179 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2180 matchStart, 1, this.options))) 2181 break; 2182 } 2183 } else { 2184 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2185 int ch = target .setIndex( matchStart ) ; 2186 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 2187 ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) ); 2188 if (!range.match(ch)) continue; 2189 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2190 matchStart, 1, this.options))) 2191 break; 2192 } 2193 } 2194 } 2195 2196 /* 2197 * Straightforward matching. 2198 */ 2199 else { 2200 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2201 if (0 <= (matchEnd = this. 
matchCharacterIterator (con, this.operations, matchStart, 1, this.options))) 2202 break; 2203 } 2204 } 2205 2206 if (matchEnd >= 0) { 2207 if (con.match != null) { 2208 con.match.setBeginning(0, matchStart); 2209 con.match.setEnd(0, matchEnd); 2210 } 2211 con.inuse = false; 2212 return true; 2213 } else { 2214 con.inuse = false; 2215 return false; 2216 } 2217 } 2218 2219 /** 2220 * @return -1 when not match; offset of the end of matched string when match. 2221 */ 2222 private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) { 2223 2224 2225 CharacterIterator target = con.ciTarget; 2226 2227 2228 2229 2230 2231 2232 while (true) { 2233 if (op == null) 2234 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 2235 if (offset > con.limit || offset < con.start) 2236 return -1; 2237 switch (op.type) { 2238 case Op.CHAR: 2239 if (isSet(opts, IGNORE_CASE)) { 2240 int ch = op.getData(); 2241 if (dx > 0) { 2242 if (offset >= con.limit || !matchIgnoreCase(ch, target .setIndex( offset ) )) 2243 return -1; 2244 offset ++; 2245 } else { 2246 int o1 = offset-1; 2247 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .setIndex( o1 ) )) 2248 return -1; 2249 offset = o1; 2250 } 2251 } else { 2252 int ch = op.getData(); 2253 if (dx > 0) { 2254 if (offset >= con.limit || ch != target .setIndex( offset ) ) 2255 return -1; 2256 offset ++; 2257 } else { 2258 int o1 = offset-1; 2259 if (o1 >= con.limit || o1 < 0 || ch != target .setIndex( o1 ) ) 2260 return -1; 2261 offset = o1; 2262 } 2263 } 2264 op = op.next; 2265 break; 2266 2267 case Op.DOT: 2268 if (dx > 0) { 2269 if (offset >= con.limit) 2270 return -1; 2271 int ch = target .setIndex( offset ) ; 2272 if (isSet(opts, SINGLE_LINE)) { 2273 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 2274 offset ++; 2275 } else { 2276 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 2277 ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) ); 2278 if 
(isEOLChar(ch)) 2279 return -1; 2280 } 2281 offset ++; 2282 } else { 2283 int o1 = offset-1; 2284 if (o1 >= con.limit || o1 < 0) 2285 return -1; 2286 int ch = target .setIndex( o1 ) ; 2287 if (isSet(opts, SINGLE_LINE)) { 2288 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 2289 o1 --; 2290 } else { 2291 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 2292 ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch); 2293 if (!isEOLChar(ch)) 2294 return -1; 2295 } 2296 offset = o1; 2297 } 2298 op = op.next; 2299 break; 2300 2301 case Op.RANGE: 2302 case Op.NRANGE: 2303 if (dx > 0) { 2304 if (offset >= con.limit) 2305 return -1; 2306 int ch = target .setIndex( offset ) ; 2307 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 2308 ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) ); 2309 RangeToken tok = op.getToken(); 2310 if (isSet(opts, IGNORE_CASE)) { 2311 tok = tok.getCaseInsensitiveToken(); 2312 if (!tok.match(ch)) { 2313 if (ch >= 0x10000) return -1; 2314 char uch; 2315 if (!tok.match(uch = Character.toUpperCase((char)ch)) 2316 && !tok.match(Character.toLowerCase(uch))) 2317 return -1; 2318 } 2319 } else { 2320 if (!tok.match(ch)) return -1; 2321 } 2322 offset ++; 2323 } else { 2324 int o1 = offset-1; 2325 if (o1 >= con.limit || o1 < 0) 2326 return -1; 2327 int ch = target .setIndex( o1 ) ; 2328 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 2329 ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch); 2330 RangeToken tok = op.getToken(); 2331 if (isSet(opts, IGNORE_CASE)) { 2332 tok = tok.getCaseInsensitiveToken(); 2333 if (!tok.match(ch)) { 2334 if (ch >= 0x10000) return -1; 2335 char uch; 2336 if (!tok.match(uch = Character.toUpperCase((char)ch)) 2337 && !tok.match(Character.toLowerCase(uch))) 2338 return -1; 2339 } 2340 } else { 2341 if (!tok.match(ch)) return -1; 2342 } 2343 offset = o1; 2344 } 2345 op = op.next; 2346 break; 2347 2348 case Op.ANCHOR: 2349 boolean go = false; 2350 switch (op.getData()) { 2351 case '^': 2352 if 
(isSet(opts, MULTIPLE_LINES)) { 2353 if (!(offset == con.start 2354 || offset > con.start && isEOLChar( target .setIndex( offset-1 ) ))) 2355 return -1; 2356 } else { 2357 if (offset != con.start) 2358 return -1; 2359 } 2360 break; 2361 2362 case '@': // Internal use only. 2363 // The @ always matches line beginnings. 2364 if (!(offset == con.start 2365 || offset > con.start && isEOLChar( target .setIndex( offset-1 ) ))) 2366 return -1; 2367 break; 2368 2369 case '$': 2370 if (isSet(opts, MULTIPLE_LINES)) { 2371 if (!(offset == con.limit 2372 || offset < con.limit && isEOLChar( target .setIndex( offset ) ))) 2373 return -1; 2374 } else { 2375 if (!(offset == con.limit 2376 || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) ) 2377 || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN 2378 && target .setIndex( offset+1 ) == LINE_FEED)) 2379 return -1; 2380 } 2381 break; 2382 2383 case 'A': 2384 if (offset != con.start) return -1; 2385 break; 2386 2387 case 'Z': 2388 if (!(offset == con.limit 2389 || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) ) 2390 || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN 2391 && target .setIndex( offset+1 ) == LINE_FEED)) 2392 return -1; 2393 break; 2394 2395 case 'z': 2396 if (offset != con.limit) return -1; 2397 break; 2398 2399 case 'b': 2400 if (con.length == 0) return -1; 2401 { 2402 int after = getWordType(target, con.start, con.limit, offset, opts); 2403 if (after == WT_IGNORE) return -1; 2404 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 2405 if (after == before) return -1; 2406 } 2407 break; 2408 2409 case 'B': 2410 if (con.length == 0) 2411 go = true; 2412 else { 2413 int after = getWordType(target, con.start, con.limit, offset, opts); 2414 go = after == WT_IGNORE 2415 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 2416 } 2417 if (!go) return -1; 2418 break; 2419 2420 case '<': 2421 if 
(con.length == 0 || offset == con.limit) return -1; 2422 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 2423 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 2424 return -1; 2425 break; 2426 2427 case '>': 2428 if (con.length == 0 || offset == con.start) return -1; 2429 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 2430 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 2431 return -1; 2432 break; 2433 } // switch anchor type 2434 op = op.next; 2435 break; 2436 2437 case Op.BACKREFERENCE: 2438 { 2439 int refno = op.getData(); 2440 if (refno <= 0 || refno >= this.nofparen) 2441 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); 2442 if (con.match.getBeginning(refno) < 0 2443 || con.match.getEnd(refno) < 0) 2444 return -1; // ******** 2445 int o2 = con.match.getBeginning(refno); 2446 int literallen = con.match.getEnd(refno)-o2; 2447 if (!isSet(opts, IGNORE_CASE)) { 2448 if (dx > 0) { 2449 if (!regionMatches(target, offset, con.limit, o2, literallen)) 2450 return -1; 2451 offset += literallen; 2452 } else { 2453 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) 2454 return -1; 2455 offset -= literallen; 2456 } 2457 } else { 2458 if (dx > 0) { 2459 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) 2460 return -1; 2461 offset += literallen; 2462 } else { 2463 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 2464 o2, literallen)) 2465 return -1; 2466 offset -= literallen; 2467 } 2468 } 2469 } 2470 op = op.next; 2471 break; 2472 case Op.STRING: 2473 { 2474 String literal = op.getString(); 2475 int literallen = literal.length(); 2476 if (!isSet(opts, IGNORE_CASE)) { 2477 if (dx > 0) { 2478 if (!regionMatches(target, offset, con.limit, literal, literallen)) 2479 return -1; 2480 offset += literallen; 2481 } else { 2482 if (!regionMatches(target, 
offset-literallen, con.limit, literal, literallen)) 2483 return -1; 2484 offset -= literallen; 2485 } 2486 } else { 2487 if (dx > 0) { 2488 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) 2489 return -1; 2490 offset += literallen; 2491 } else { 2492 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 2493 literal, literallen)) 2494 return -1; 2495 offset -= literallen; 2496 } 2497 } 2498 } 2499 op = op.next; 2500 break; 2501 2502 case Op.CLOSURE: 2503 { 2504 /* 2505 * Saves current position to avoid 2506 * zero-width repeats. 2507 */ 2508 int id = op.getData(); 2509 if (id >= 0) { 2510 int previousOffset = con.offsets[id]; 2511 if (previousOffset < 0 || previousOffset != offset) { 2512 con.offsets[id] = offset; 2513 } else { 2514 con.offsets[id] = -1; 2515 op = op.next; 2516 break; 2517 } 2518 } 2519 2520 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 2521 if (id >= 0) con.offsets[id] = -1; 2522 if (ret >= 0) return ret; 2523 op = op.next; 2524 } 2525 break; 2526 2527 case Op.QUESTION: 2528 { 2529 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 2530 if (ret >= 0) return ret; 2531 op = op.next; 2532 } 2533 break; 2534 2535 case Op.NONGREEDYCLOSURE: 2536 case Op.NONGREEDYQUESTION: 2537 { 2538 int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); 2539 if (ret >= 0) return ret; 2540 op = op.getChild(); 2541 } 2542 break; 2543 2544 case Op.UNION: 2545 for (int i = 0; i < op.size(); i ++) { 2546 int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts); 2547 if (DEBUG) { 2548 System.err.println("UNION: "+i+", ret="+ret); 2549 } 2550 if (ret >= 0) return ret; 2551 } 2552 return -1; 2553 2554 case Op.CAPTURE: 2555 int refno = op.getData(); 2556 if (con.match != null && refno > 0) { 2557 int save = con.match.getBeginning(refno); 2558 con.match.setBeginning(refno, offset); 2559 int ret = this. 
matchCharacterIterator (con, op.next, offset, dx, opts); 2560 if (ret < 0) con.match.setBeginning(refno, save); 2561 return ret; 2562 } else if (con.match != null && refno < 0) { 2563 int index = -refno; 2564 int save = con.match.getEnd(index); 2565 con.match.setEnd(index, offset); 2566 int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); 2567 if (ret < 0) con.match.setEnd(index, save); 2568 return ret; 2569 } 2570 op = op.next; 2571 break; 2572 2573 case Op.LOOKAHEAD: 2574 if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1; 2575 op = op.next; 2576 break; 2577 case Op.NEGATIVELOOKAHEAD: 2578 if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1; 2579 op = op.next; 2580 break; 2581 case Op.LOOKBEHIND: 2582 if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1; 2583 op = op.next; 2584 break; 2585 case Op.NEGATIVELOOKBEHIND: 2586 if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1; 2587 op = op.next; 2588 break; 2589 2590 case Op.INDEPENDENT: 2591 { 2592 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 2593 if (ret < 0) return ret; 2594 offset = ret; 2595 op = op.next; 2596 } 2597 break; 2598 2599 case Op.MODIFIER: 2600 { 2601 int localopts = opts; 2602 localopts |= op.getData(); 2603 localopts &= ~op.getData2(); 2604 //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); 2605 int ret = this. 
matchCharacterIterator (con, op.getChild(), offset, dx, localopts); 2606 if (ret < 0) return ret; 2607 offset = ret; 2608 op = op.next; 2609 } 2610 break; 2611 2612 case Op.CONDITION: 2613 { 2614 Op.ConditionOp cop = (Op.ConditionOp)op; 2615 boolean matchp = false; 2616 if (cop.refNumber > 0) { 2617 if (cop.refNumber >= this.nofparen) 2618 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); 2619 matchp = con.match.getBeginning(cop.refNumber) >= 0 2620 && con.match.getEnd(cop.refNumber) >= 0; 2621 } else { 2622 matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts); 2623 } 2624 2625 if (matchp) { 2626 op = cop.yes; 2627 } else if (cop.no != null) { 2628 op = cop.no; 2629 } else { 2630 op = cop.next; 2631 } 2632 } 2633 break; 2634 2635 default: 2636 throw new RuntimeException("Unknown operation type: "+op.type); 2637 } // switch (op.type) 2638 } // while 2639 } 2640 2641 private static final int getPreviousWordType(CharacterIterator target, int begin, int end, 2642 int offset, int opts) { 2643 int ret = getWordType(target, begin, end, --offset, opts); 2644 while (ret == WT_IGNORE) 2645 ret = getWordType(target, begin, end, --offset, opts); 2646 return ret; 2647 } 2648 2649 private static final int getWordType(CharacterIterator target, int begin, int end, 2650 int offset, int opts) { 2651 if (offset < begin || offset >= end) return WT_OTHER; 2652 return getWordType0( target .setIndex( offset ) , opts); 2653 } 2654 2655 2656 2657 private static final boolean regionMatches(CharacterIterator target, int offset, int limit, 2658 String part, int partlen) { 2659 if (offset < 0) return false; 2660 if (limit-offset < partlen) 2661 return false; 2662 int i = 0; 2663 while (partlen-- > 0) { 2664 if ( target .setIndex( offset++ ) != part.charAt(i++)) 2665 return false; 2666 } 2667 return true; 2668 } 2669 2670 private static final boolean regionMatches(CharacterIterator target, int offset, int limit, 
2671 int offset2, int partlen) { 2672 if (offset < 0) return false; 2673 if (limit-offset < partlen) 2674 return false; 2675 int i = offset2; 2676 while (partlen-- > 0) { 2677 if ( target .setIndex( offset++ ) != target .setIndex( i++ ) ) 2678 return false; 2679 } 2680 return true; 2681 } 2682 2683 /** 2684 * @see java.lang.String#regionMatches 2685 */ 2686 private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit, 2687 String part, int partlen) { 2688 if (offset < 0) return false; 2689 if (limit-offset < partlen) 2690 return false; 2691 int i = 0; 2692 while (partlen-- > 0) { 2693 char ch1 = target .setIndex( offset++ ) ; 2694 char ch2 = part.charAt(i++); 2695 if (ch1 == ch2) 2696 continue; 2697 char uch1 = Character.toUpperCase(ch1); 2698 char uch2 = Character.toUpperCase(ch2); 2699 if (uch1 == uch2) 2700 continue; 2701 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 2702 return false; 2703 } 2704 return true; 2705 } 2706 2707 private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit, 2708 int offset2, int partlen) { 2709 if (offset < 0) return false; 2710 if (limit-offset < partlen) 2711 return false; 2712 int i = offset2; 2713 while (partlen-- > 0) { 2714 char ch1 = target .setIndex( offset++ ) ; 2715 char ch2 = target .setIndex( i++ ) ; 2716 if (ch1 == ch2) 2717 continue; 2718 char uch1 = Character.toUpperCase(ch1); 2719 char uch2 = Character.toUpperCase(ch2); 2720 if (uch1 == uch2) 2721 continue; 2722 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 2723 return false; 2724 } 2725 return true; 2726 } 2727 2728 2729 2730 2731 // ================================================================ 2732 2733 /** 2734 * A regular expression. 2735 * @serial 2736 */ 2737 String regex; 2738 /** 2739 * @serial 2740 */ 2741 int options; 2742 2743 /** 2744 * The number of parenthesis in the regular expression. 
2745 * @serial 2746 */ 2747 int nofparen; 2748 /** 2749 * Internal representation of the regular expression. 2750 * @serial 2751 */ 2752 Token tokentree; 2753 2754 boolean hasBackReferences = false; 2755 2756 transient int minlength; 2757 transient Op operations = null; 2758 transient int numberOfClosures; 2759 transient Context context = null; 2760 transient RangeToken firstChar = null; 2761 2762 transient String fixedString = null; 2763 transient int fixedStringOptions; 2764 transient BMPattern fixedStringTable = null; 2765 transient boolean fixedStringOnly = false; 2766 2767 2768 static final class Context { 2769 CharacterIterator ciTarget; 2770 String strTarget; 2771 char[] charTarget; 2772 int start; 2773 int limit; 2774 int length; 2775 Match match; 2776 boolean inuse = false; 2777 int[] offsets; 2778 2779 Context() { 2780 } 2781 2782 private void resetCommon(int nofclosures) { 2783 this.length = this.limit-this.start; 2784 this.inuse = true; 2785 this.match = null; 2786 if (this.offsets == null || this.offsets.length != nofclosures) 2787 this.offsets = new int[nofclosures]; 2788 for (int i = 0; i < nofclosures; i ++) this.offsets[i] = -1; 2789 } 2790 void reset(CharacterIterator target, int start, int limit, int nofclosures) { 2791 this.ciTarget = target; 2792 this.start = start; 2793 this.limit = limit; 2794 this.resetCommon(nofclosures); 2795 } 2796 void reset(String target, int start, int limit, int nofclosures) { 2797 this.strTarget = target; 2798 this.start = start; 2799 this.limit = limit; 2800 this.resetCommon(nofclosures); 2801 } 2802 void reset(char[] target, int start, int limit, int nofclosures) { 2803 this.charTarget = target; 2804 this.start = start; 2805 this.limit = limit; 2806 this.resetCommon(nofclosures); 2807 } 2808 } 2809 2810 /** 2811 * Prepares for matching. This method is called just before starting matching. 
2812 */ 2813 void prepare() { 2814 if (Op.COUNT) Op.nofinstances = 0; 2815 this.compile(this.tokentree); 2816 /* 2817 if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .* 2818 Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@'); 2819 anchor.next = this.operations; 2820 this.operations = anchor; 2821 } 2822 */ 2823 if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances); 2824 2825 this.minlength = this.tokentree.getMinLength(); 2826 2827 this.firstChar = null; 2828 if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) 2829 && !isSet(this.options, XMLSCHEMA_MODE)) { 2830 RangeToken firstChar = Token.createRange(); 2831 int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options); 2832 if (fresult == Token.FC_TERMINAL) { 2833 firstChar.compactRanges(); 2834 this.firstChar = firstChar; 2835 if (DEBUG) 2836 System.err.println("DEBUG: Use the first character optimization: "+firstChar); 2837 } 2838 } 2839 2840 if (this.operations != null 2841 && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR) 2842 && this.operations.next == null) { 2843 if (DEBUG) 2844 System.err.print(" *** Only fixed string! 
*** "); 2845 this.fixedStringOnly = true; 2846 if (this.operations.type == Op.STRING) 2847 this.fixedString = this.operations.getString(); 2848 else if (this.operations.getData() >= 0x10000) { // Op.CHAR 2849 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData()); 2850 } else { 2851 char[] ac = new char[1]; 2852 ac[0] = (char)this.operations.getData(); 2853 this.fixedString = new String(ac); 2854 } 2855 this.fixedStringOptions = this.options; 2856 this.fixedStringTable = new BMPattern(this.fixedString, 256, 2857 isSet(this.fixedStringOptions, IGNORE_CASE)); 2858 } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION) 2859 && !isSet(this.options, XMLSCHEMA_MODE)) { 2860 Token.FixedStringContainer container = new Token.FixedStringContainer(); 2861 this.tokentree.findFixedString(container, this.options); 2862 this.fixedString = container.token == null ? null : container.token.getString(); 2863 this.fixedStringOptions = container.options; 2864 if (this.fixedString != null && this.fixedString.length() < 2) 2865 this.fixedString = null; 2866 // This pattern has a fixed string of which length is more than one. 2867 if (this.fixedString != null) { 2868 this.fixedStringTable = new BMPattern(this.fixedString, 256, 2869 isSet(this.fixedStringOptions, IGNORE_CASE)); 2870 if (DEBUG) { 2871 System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length() 2872 +"/" //+this.fixedString 2873 +"/"+REUtil.createOptionString(this.fixedStringOptions)); 2874 System.err.print("String: "); 2875 REUtil.dumpString(this.fixedString); 2876 } 2877 } 2878 } 2879 } 2880 2881 /** 2882 * An option. 2883 * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span> 2884 * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span> 2885 * does not capture. 
2886 * 2887 * @see #RegularExpression(java.lang.String,int) 2888 * @see #setPattern(java.lang.String,int) 2889 static final int MARK_PARENS = 1<<0; 2890 */ 2891 2892 /** 2893 * "i" 2894 */ 2895 static final int IGNORE_CASE = 1<<1; 2896 2897 /** 2898 * "s" 2899 */ 2900 static final int SINGLE_LINE = 1<<2; 2901 2902 /** 2903 * "m" 2904 */ 2905 static final int MULTIPLE_LINES = 1<<3; 2906 2907 /** 2908 * "x" 2909 */ 2910 static final int EXTENDED_COMMENT = 1<<4; 2911 2912 /** 2913 * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>. 2914 * 2915 * @see #RegularExpression(java.lang.String,int) 2916 * @see #setPattern(java.lang.String,int) 2917 * @see #UNICODE_WORD_BOUNDARY 2918 */ 2919 static final int USE_UNICODE_CATEGORY = 1<<5; // "u" 2920 2921 /** 2922 * An option. 2923 * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>. 2924 * <p>By default, the engine considers a position between a word character 2925 * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character 2926 * is a word boundary. 2927 * <p>By this option, the engine checks word boundaries with the method of 2928 * 'Unicode Regular Expression Guidelines' Revision 4. 2929 * 2930 * @see #RegularExpression(java.lang.String,int) 2931 * @see #setPattern(java.lang.String,int) 2932 */ 2933 static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w" 2934 2935 /** 2936 * "H" 2937 */ 2938 static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7; 2939 /** 2940 * "F" 2941 */ 2942 static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8; 2943 /** 2944 * "X". XML Schema mode. 2945 */ 2946 static final int XMLSCHEMA_MODE = 1<<9; 2947 /** 2948 * ",". 2949 */ 2950 static final int SPECIAL_COMMA = 1<<10; 2951 2952 2953 private static final boolean isSet(int options, int flag) { 2954 return (options & flag) == flag; 2955 } 2956 2957 /** 2958 * Creates a new RegularExpression instance. 
2959 * 2960 * @param regex A regular expression 2961 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2962 */ 2963 public RegularExpression(String regex) throws ParseException { 2964 this.setPattern(regex, null); 2965 } 2966 2967 /** 2968 * Creates a new RegularExpression instance with options. 2969 * 2970 * @param regex A regular expression 2971 * @param options A String consisted of "i" "m" "s" "u" "w" "," "X" 2972 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2973 */ 2974 public RegularExpression(String regex, String options) throws ParseException { 2975 this.setPattern(regex, options); 2976 } 2977 2978 RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) { 2979 this.regex = regex; 2980 this.tokentree = tok; 2981 this.nofparen = parens; 2982 this.options = options; 2983 this.hasBackReferences = hasBackReferences; 2984 } 2985 2986 /** 2987 * 2988 */ 2989 public void setPattern(String newPattern) throws ParseException { 2990 this.setPattern(newPattern, this.options); 2991 } 2992 2993 private void setPattern(String newPattern, int options) throws ParseException { 2994 this.regex = newPattern; 2995 this.options = options; 2996 RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE) 2997 ? new ParserForXMLSchema() : new RegexParser(); 2998 this.tokentree = rp.parse(this.regex, this.options); 2999 this.nofparen = rp.parennumber; 3000 this.hasBackReferences = rp.hasBackReferences; 3001 3002 this.operations = null; 3003 this.context = null; 3004 } 3005 /** 3006 * 3007 */ 3008 public void setPattern(String newPattern, String options) throws ParseException { 3009 this.setPattern(newPattern, REUtil.parseOptions(options)); 3010 } 3011 3012 /** 3013 * 3014 */ 3015 public String getPattern() { 3016 return this.regex; 3017 } 3018 3019 /** 3020 * Represents this instence in String. 
3021 */ 3022 public String toString() { 3023 return this.tokentree.toString(this.options); 3024 } 3025 3026 /** 3027 * Returns a option string. 3028 * The order of letters in it may be different from a string specified 3029 * in a constructor or <code>setPattern()</code>. 3030 * 3031 * @see #RegularExpression(java.lang.String,java.lang.String) 3032 * @see #setPattern(java.lang.String,java.lang.String) 3033 */ 3034 public String getOptions() { 3035 return REUtil.createOptionString(this.options); 3036 } 3037 3038 /** 3039 * Return true if patterns are the same and the options are equivalent. 3040 */ 3041 public boolean equals(Object obj) { 3042 if (obj == null) return false; 3043 if (!(obj instanceof RegularExpression)) 3044 return false; 3045 RegularExpression r = (RegularExpression)obj; 3046 return this.regex.equals(r.regex) && this.options == r.options; 3047 } 3048 3049 boolean equals(String pattern, int options) { 3050 return this.regex.equals(pattern) && this.options == options; 3051 } 3052 3053 /** 3054 * 3055 */ 3056 public int hashCode() { 3057 return (this.regex+"/"+this.getOptions()).hashCode(); 3058 } 3059 3060 /** 3061 * Return the number of regular expression groups. 3062 * This method returns 1 when the regular expression has no capturing-parenthesis. 3063 * 3064 */ 3065 public int getNumberOfGroups() { 3066 return this.nofparen; 3067 } 3068 3069 // ================================================================ 3070 3071 private static final int WT_IGNORE = 0; 3072 private static final int WT_LETTER = 1; 3073 private static final int WT_OTHER = 2; 3074 private static final int getWordType0(char ch, int opts) { 3075 if (!isSet(opts, UNICODE_WORD_BOUNDARY)) { 3076 if (isSet(opts, USE_UNICODE_CATEGORY)) { 3077 return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER; 3078 } 3079 return isWordChar(ch) ? 
WT_LETTER : WT_OTHER; 3080 } 3081 3082 switch (Character.getType(ch)) { 3083 case Character.UPPERCASE_LETTER: // L 3084 case Character.LOWERCASE_LETTER: // L 3085 case Character.TITLECASE_LETTER: // L 3086 case Character.MODIFIER_LETTER: // L 3087 case Character.OTHER_LETTER: // L 3088 case Character.LETTER_NUMBER: // N 3089 case Character.DECIMAL_DIGIT_NUMBER: // N 3090 case Character.OTHER_NUMBER: // N 3091 case Character.COMBINING_SPACING_MARK: // Mc 3092 return WT_LETTER; 3093 3094 case Character.FORMAT: // Cf 3095 case Character.NON_SPACING_MARK: // Mn 3096 case Character.ENCLOSING_MARK: // Mc 3097 return WT_IGNORE; 3098 3099 case Character.CONTROL: // Cc 3100 switch (ch) { 3101 case '\t': 3102 case '\n': 3103 case '\u000B': 3104 case '\f': 3105 case '\r': 3106 return WT_OTHER; 3107 default: 3108 return WT_IGNORE; 3109 } 3110 3111 default: 3112 return WT_OTHER; 3113 } 3114 } 3115 3116 // ================================================================ 3117 3118 static final int LINE_FEED = 0x000A; 3119 static final int CARRIAGE_RETURN = 0x000D; 3120 static final int LINE_SEPARATOR = 0x2028; 3121 static final int PARAGRAPH_SEPARATOR = 0x2029; 3122 3123 private static final boolean isEOLChar(int ch) { 3124 return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR 3125 || ch == PARAGRAPH_SEPARATOR; 3126 } 3127 3128 private static final boolean isWordChar(int ch) { // Legacy word characters 3129 if (ch == '_') return true; 3130 if (ch < '0') return false; 3131 if (ch > 'z') return false; 3132 if (ch <= '9') return true; 3133 if (ch < 'A') return false; 3134 if (ch <= 'Z') return true; 3135 if (ch < 'a') return false; 3136 return true; 3137 } 3138 3139 private static final boolean matchIgnoreCase(int chardata, int ch) { 3140 if (chardata == ch) return true; 3141 if (chardata > 0xffff || ch > 0xffff) return false; 3142 char uch1 = Character.toUpperCase((char)chardata); 3143 char uch2 = Character.toUpperCase((char)ch); 3144 if (uch1 == uch2) return 
true; 3145 return Character.toLowerCase(uch1) == Character.toLowerCase(uch2); 3146 } 3147 }