1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one or more 3 * contributor license agreements. See the NOTICE file distributed with 4 * this work for additional information regarding copyright ownership. 5 * The ASF licenses this file to You under the Apache License, Version 2.0 6 * (the "License"); you may not use this file except in compliance with 7 * the License. You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package org.apache.xerces.impl.xpath.regex; 19 20 import java.text.CharacterIterator; 21 22 /** 23 * A regular expression matching engine using Non-deterministic Finite Automaton (NFA). 24 * This engine does not conform to the POSIX regular expression. 25 * 26 * <hr width="50%"> 27 * <h3>How to use</h3> 28 * 29 * <dl> 30 * <dt>A. Standard way 31 * <dd> 32 * <pre> 33 * RegularExpression re = new RegularExpression(<var>regex</var>); 34 * if (re.matches(text)) { ... } 35 * </pre> 36 * 37 * <dt>B. Capturing groups 38 * <dd> 39 * <pre> 40 * RegularExpression re = new RegularExpression(<var>regex</var>); 41 * Match match = new Match(); 42 * if (re.matches(text, match)) { 43 * ... // You can refer captured texts with methods of the <code>Match</code> class. 
44 * } 45 * </pre> 46 * 47 * </dl> 48 * 49 * <h4>Case-insensitive matching</h4> 50 * <pre> 51 * RegularExpression re = new RegularExpression(<var>regex</var>, "i"); 52 * if (re.matches(text)) { ... } 53 * </pre> 54 * 55 * <h4>Options</h4> 56 * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a> 57 * or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>. 58 * This <var>options</var> parameter consists of the following characters. 59 * </p> 60 * <dl> 61 * <dt><a name="I_OPTION"><code>"i"</code></a> 62 * <dd>This option indicates case-insensitive matching. 63 * <dt><a name="M_OPTION"><code>"m"</code></a> 64 * <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text. 65 * <dt><a name="S_OPTION"><code>"s"</code></a> 66 * <dd class="REGEX"><kbd>.</kbd> matches any one character. 67 * <dt><a name="U_OPTION"><code>"u"</code></a> 68 * <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \< \></kbd> so that they are Unicode-aware. 69 * <dt><a name="W_OPTION"><code>"w"</code></a> 70 * <dd class="REGEX">By this option, <kbd>\b \B \< \></kbd> are processed with the method of 71 * 'Unicode Regular Expression Guidelines' Revision 4. 72 * When "w" and "u" are specified at the same time, 73 * <kbd>\b \B \< \></kbd> are processed for the "w" option. 74 * <dt><a name="COMMA_OPTION"><code>","</code></a> 75 * <dd>The parser treats a comma in a character class as a range separator. 76 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option. 77 * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option. 
78 * 79 * <dt><a name="X_OPTION"><code>"X"</code></a> 80 * <dd class="REGEX"> 81 * By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>. 82 * The <code>matches()</code> method does not do substring matching 83 * but entire string matching. 84 * 85 * </dl> 86 * 87 * <hr width="50%"> 88 * <h3>Syntax</h3> 89 * <table border="1" bgcolor="#ddeeff"> 90 * <tr> 91 * <td> 92 * <h4>Differences from the Perl 5 regular expression</h4> 93 * <ul> 94 * <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.) 95 * <li>Supports subtraction, union, and intersection operations for character classes. 96 * <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations), 97 * <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>, 98 * <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>, 99 * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>, 100 * <kbd>(?{</kbd><var>code</var><kbd>})</kbd>, <kbd>(??{</kbd><var>code</var><kbd>})</kbd> 101 * </ul> 102 * </td> 103 * </tr> 104 * </table> 105 * 106 * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P> 107 * <ul> 108 * <li>Character 109 * <dl> 110 * <dt class="REGEX"><kbd>.</kbd> (A period) 111 * <dd>Matches any one character except the following characters. 112 * <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D), 113 * PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028) 114 * <dd>This expression matches one code point in Unicode. It can match a pair of surrogates. 115 * <dd>When <a href="#S_OPTION">the "s" option</a> is specified, 116 * it matches any character including the above four characters. 
117 * 118 * <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd> 119 * <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A), 120 * CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009) 121 * 122 * <dt class="REGEX"><kbd>\c</kbd><var>C</var> 123 * <dd>Matches a control character. 124 * The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>', 125 * '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'. 126 * It matches a control character of which the character code is less than 127 * the character code of the <var>C</var> by 0x0040. 128 * <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A), 129 * and a <kbd>\c[</kbd> matches an ESCAPE (U+001B). 130 * 131 * <dt class="REGEX">a non-meta character 132 * <dd>Matches the character. 133 * 134 * <dt class="REGEX"><KBD>\</KBD> + a meta character 135 * <dd>Matches the meta character. 136 * 137 * <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> 138 * <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode. 139 * You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and 140 * variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>. 141 * 142 * <!-- 143 * <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var> 144 * <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode. 145 * --> 146 * 147 * <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var> 148 * <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode. 149 * 150 * <dt class="REGEX"><kbd>\g</kbd> 151 * <dd>Matches a grapheme. 
152 * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd> 153 * 154 * <dt class="REGEX"><kbd>\X</kbd> 155 * <dd class="REGEX">Matches a combining character sequence. 156 * It is equivalent to <kbd>(?:\PM\pM*)</kbd> 157 * </dl> 158 * </li> 159 * 160 * <li>Character class 161 * <dl> 162 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>) 163 + * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>) 164 * <dd>Positive character class. It matches a character in ranges. 165 * <dd><var>R<sub>n</sub></var>: 166 * <ul> 167 * <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>) 168 * <p>This range matches the character. 169 * <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var> 170 * <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point. 171 + * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>, 172 + * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd> 173 * <p>... 174 * <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd> 175 * <p>These expressions specifies the same ranges as the following expressions. 
176 * </ul> 177 * <p class="REGEX">Enumerated ranges are merged (union operation). 178 * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd> 179 * 180 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>) 181 * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>) 182 * <dd>Negative character class. It matches a character not in ranges. 183 * 184 * <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd> 185 * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.) 186 * <dd>Subtraction or union or intersection for character classes. 187 * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>. 188 * <dd>The result of this operations is a <u>positive character class</u> 189 * even if an expression includes any negative character classes. 190 * You have to take care on this in case-insensitive matching. 191 * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>, 192 * which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching. 193 * But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because 194 * it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>' 195 * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>. 
196 * 197 * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt> 198 * <dd>Character class subtraction for the XML Schema. 199 * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>. 200 * 201 * <dt class="REGEX"><kbd>\d</kbd> 202 * <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>. 203 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 204 * <span class="REGEX"><kbd>\p{Nd}</kbd></span>. 205 * 206 * <dt class="REGEX"><kbd>\D</kbd> 207 * <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd> 208 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 209 * <span class="REGEX"><kbd>\P{Nd}</kbd></span>. 210 * 211 * <dt class="REGEX"><kbd>\s</kbd> 212 * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd> 213 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 214 * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>. 215 * 216 * <dt class="REGEX"><kbd>\S</kbd> 217 * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd> 218 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 219 * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>. 220 * 221 * <dt class="REGEX"><kbd>\w</kbd> 222 * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd> 223 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 224 * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. 225 * 226 * <dt class="REGEX"><kbd>\W</kbd> 227 * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd> 228 * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to 229 * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>. 
230 * 231 * <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd> 232 * <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>. 233 * The following names are available: 234 * <dl> 235 * <dt>Unicode General Categories: 236 * <dd><kbd> 237 * L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp, 238 * Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So, 239 * </kbd> 240 * <dd>(Currently the Cn category includes U+10000-U+10FFFF characters) 241 * <dt>Unicode Blocks: 242 * <dd><kbd> 243 * Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B, 244 * IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek, 245 * Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati, 246 * Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian, 247 * Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation, 248 * Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols, 249 * Letterlike Symbols, Number Forms, Arrows, Mathematical Operators, 250 * Miscellaneous Technical, Control Pictures, Optical Character Recognition, 251 * Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes, 252 * Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana, 253 * Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun, 254 * Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs, 255 * Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates, 256 * Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms, 257 * Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms, 258 * Small Form Variants, Arabic Presentation Forms-B, Specials, 259 * Halfwidth and Fullwidth 
Forms 260 * </kbd> 261 * <dt>Others: 262 * <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>) 263 * <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>) 264 * <dd><kbd>UNASSIGNED</kbd> 265 * (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>) 266 * </dl> 267 * 268 * <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd> 269 * <dd>Matches one character not in the specified General Category or the specified Block. 270 * </dl> 271 * </li> 272 * 273 * <li>Selection and Quantifier 274 * <dl> 275 * <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR> 276 * <dd>Matches <var>X</var> or <var>Y</var>. 277 * 278 * <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD> 279 * <dd>Matches 0 or more <var>X</var>. 280 * 281 * <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD> 282 * <dd>Matches 1 or more <var>X</var>. 283 * 284 * <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD> 285 * <dd>Matches 0 or 1 <var>X</var>. 286 * 287 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd> 288 * <dd>Matches <var>X</var> exactly <var>number</var> times. 289 * 290 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd> 291 * <dd>Matches <var>X</var> <var>min</var> or more times. 292 * 293 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd> 294 * <dd>Matches <var>X</var> at least <var>min</var> and at most <var>max</var> times. 295 * 296 * <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd> 297 * <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd> 298 * <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd> 299 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd> 300 * <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd> 301 * <dd>Non-greedy matching. 302 * </dl> 303 * </li> 304 * 305 * <li>Grouping, Capturing, and Back-reference 306 * <dl> 307 * <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD> 308 * <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>". 309 * If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>", 310 * you have to write "<KBD>(?:foo)+</KBD>". 
311 * 312 * <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD> 313 * <dd>Grouping with capturing. 314 * It makes a group, and applications can find out 315 * where in the target text a group matched with methods of a <code>Match</code> instance 316 * after <code><a href="#matches(java.lang.String, org.apache.xerces.impl.xpath.regex.Match)">matches(String,Match)</a></code>. 317 * The 0th group means the whole of this regular expression. 318 * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis. 319 * 320 * <p>For instance, a regular expression is 321 * "<FONT color=blue><KBD> *([^<:]*) +<([^>]*)> *</KBD></FONT>" 322 * and target text is 323 * "<FONT color=red><KBD>From: TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>": 324 * <ul> 325 * <li><code>Match.getCapturedText(0)</code>: 326 * "<FONT color=red><KBD> TAMURA Kent <kent@trl.ibm.co.jp></KBD></FONT>" 327 * <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>" 328 * <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>" 329 * </ul> 330 * 331 * <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd> 332 * <dd>Back-reference to the text captured by the group of that number. 333 * 334 * <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd> 335 * <dd>Independent expression group. ................ 336 * 337 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd> 338 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd> 339 * <dd>............................ 340 * <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'. 341 * Note that it can not contain 'u'. 342 * 343 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd> 344 * <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd> 345 * <dd>...... 346 * <dd>These expressions must be at the beginning of a group. 
347 * </dl> 348 * </li> 349 * 350 * <li>Anchor 351 * <dl> 352 * <dt class="REGEX"><kbd>\A</kbd> 353 * <dd>Matches the beginning of the text. 354 * 355 * <dt class="REGEX"><kbd>\Z</kbd> 356 * <dd>Matches the end of the text, or before an EOL character at the end of the text, 357 * or CARRIAGE RETURN + LINE FEED at the end of the text. 358 * 359 * <dt class="REGEX"><kbd>\z</kbd> 360 * <dd>Matches the end of the text. 361 * 362 * <dt class="REGEX"><kbd>^</kbd> 363 * <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>. 364 * <dd>When <a href="#M_OPTION">a "m" option</a> is set, 365 * it matches the beginning of the text, or after one of EOL characters ( 366 * LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028), 367 * PARAGRAPH SEPARATOR (U+2029).) 368 * 369 * <dt class="REGEX"><kbd>$</kbd> 370 * <dd>Matches the end of the text, or before an EOL character at the end of the text, 371 * or CARRIAGE RETURN + LINE FEED at the end of the text. 372 * <dd>When <a href="#M_OPTION">a "m" option</a> is set, 373 * it matches the end of the text, or before an EOL character. 374 * 375 * <dt class="REGEX"><kbd>\b</kbd> 376 * <dd>Matches a word boundary. 377 * (See <a href="#W_OPTION">a "w" option</a>) 378 * 379 * <dt class="REGEX"><kbd>\B</kbd> 380 * <dd>Matches a non-word boundary. 381 * (See <a href="#W_OPTION">a "w" option</a>) 382 * 383 * <dt class="REGEX"><kbd>\<</kbd> 384 * <dd>Matches the beginning of a word. 385 * (See <a href="#W_OPTION">a "w" option</a>) 386 * 387 * <dt class="REGEX"><kbd>\></kbd> 388 * <dd>Matches the end of a word. 389 * (See <a href="#W_OPTION">a "w" option</a>) 390 * </dl> 391 * </li> 392 * <li>Lookahead and lookbehind 393 * <dl> 394 * <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd> 395 * <dd>Lookahead. 396 * 397 * <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd> 398 * <dd>Negative lookahead. 
399 * 400 * <dt class="REGEX"><kbd>(?<=</kbd><var>X</var><kbd>)</kbd> 401 * <dd>Lookbehind. 402 * <dd>(Note for text capturing......) 403 * 404 * <dt class="REGEX"><kbd>(?<!</kbd><var>X</var><kbd>)</kbd> 405 * <dd>Negative lookbehind. 406 * </dl> 407 * </li> 408 * 409 * <li>Misc. 410 * <dl> 411 * <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>, 412 * <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd> 413 * <dd>...... 414 * <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd> 415 * <dd>Comment. A comment string consists of characters except '<kbd>)</kbd>'. 416 * You can not write comments in character classes and before quantifiers. 417 * </dl> 418 * </li> 419 * </ul> 420 * 421 * 422 * <hr width="50%"> 423 * <h3>BNF for the regular expression</h3> 424 * <pre> 425 * regex ::= ('(?' options ')')? term ('|' term)* 426 * term ::= factor+ 427 * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )? 428 * | '(?#' [^)]* ')' 429 * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}' 430 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] 431 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X' 432 * | '(?>' regex ')' | '(?' options ':' regex ')' 433 * | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')' 434 * options ::= [imsw]* ('-' [imsw]+)? 435 * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' 436 * looks ::= '(?=' regex ')' | '(?!' regex ')' 437 * | '(?<=' regex ')' | '(?<!' 
regex ')' 438 * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1 439 * category-block ::= '\' [pP] category-symbol-1 440 * | ('\p{' | '\P{') (category-symbol | block-name 441 * | other-properties) '}' 442 * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S' 443 * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | Lo' 444 * | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No' 445 * | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs' 446 * | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po' 447 * | 'Sm' | 'Sc' | 'Sk' | 'So' 448 * block-name ::= (See above) 449 * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED' 450 * character-1 ::= (any character except meta-characters) 451 * 452 * char-class ::= '[' ranges ']' 453 * | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')' 454 * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+ 455 * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block 456 * | range-char | range-char '-' range-char 457 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2 458 * code-point ::= '\x' hex-char hex-char 459 * | '\x{' hex-char+ '}' 460 * <!-- | '\u005c u' hex-char hex-char hex-char hex-char 461 * --> | '\v' hex-char hex-char hex-char hex-char hex-char hex-char 462 * hex-char ::= [0-9a-fA-F] 463 * character-2 ::= (any character except \[]-,) 464 * </pre> 465 * 466 * <hr width="50%"> 467 * <h3>TODO</h3> 468 * <ul> 469 * <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a> 470 * <ul> 471 * <li>2.4 Canonical Equivalents 472 * <li>Level 3 473 * </ul> 474 * <li>Parsing performance 475 * </ul> 476 * 477 * <hr width="50%"> 478 * 479 * @xerces.internal 480 * 481 * @author TAMURA Kent <kent@trl.ibm.co.jp> 482 * @version $Id: RegularExpression.java 446721 2006-09-15 20:35:34Z mrglavas $ 483 */ 484 public class RegularExpression implements java.io.Serializable { 485 486 private static final long serialVersionUID = 6242499334195006401L; 
487 488 static final boolean DEBUG = false; 489 490 /** 491 * Compiles a token tree into an operation flow. 492 */ 493 private synchronized void compile(Token tok) { 494 if (this.operations != null) 495 return; 496 this.numberOfClosures = 0; 497 this.operations = this.compile(tok, null, false); 498 } 499 500 /** 501 * Converts a token to an operation. 502 */ 503 private Op compile(Token tok, Op next, boolean reverse) { 504 Op ret; 505 switch (tok.type) { 506 case Token.DOT: 507 ret = Op.createDot(); 508 ret.next = next; 509 break; 510 511 case Token.CHAR: 512 ret = Op.createChar(tok.getChar()); 513 ret.next = next; 514 break; 515 516 case Token.ANCHOR: 517 ret = Op.createAnchor(tok.getChar()); 518 ret.next = next; 519 break; 520 521 case Token.RANGE: 522 case Token.NRANGE: 523 ret = Op.createRange(tok); 524 ret.next = next; 525 break; 526 527 case Token.CONCAT: 528 ret = next; 529 if (!reverse) { 530 for (int i = tok.size()-1; i >= 0; i --) { 531 ret = compile(tok.getChild(i), ret, false); 532 } 533 } else { 534 for (int i = 0; i < tok.size(); i ++) { 535 ret = compile(tok.getChild(i), ret, true); 536 } 537 } 538 break; 539 540 case Token.UNION: 541 Op.UnionOp uni = Op.createUnion(tok.size()); 542 for (int i = 0; i < tok.size(); i ++) { 543 uni.addElement(compile(tok.getChild(i), next, reverse)); 544 } 545 ret = uni; // ret.next is null. 546 break; 547 548 case Token.CLOSURE: 549 case Token.NONGREEDYCLOSURE: 550 Token child = tok.getChild(0); 551 int min = tok.getMin(); 552 int max = tok.getMax(); 553 if (min >= 0 && min == max) { // {n} 554 ret = next; 555 for (int i = 0; i < min; i ++) { 556 ret = compile(child, ret, reverse); 557 } 558 break; 559 } 560 if (min > 0 && max > 0) 561 max -= min; 562 if (max > 0) { 563 // X{2,6} -> XX(X(X(XX?)?)?)? 
564 ret = next; 565 for (int i = 0; i < max; i ++) { 566 Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE); 567 q.next = next; 568 q.setChild(compile(child, ret, reverse)); 569 ret = q; 570 } 571 } else { 572 Op.ChildOp op; 573 if (tok.type == Token.NONGREEDYCLOSURE) { 574 op = Op.createNonGreedyClosure(); 575 } else { // Token.CLOSURE 576 if (child.getMinLength() == 0) 577 op = Op.createClosure(this.numberOfClosures++); 578 else 579 op = Op.createClosure(-1); 580 } 581 op.next = next; 582 op.setChild(compile(child, op, reverse)); 583 ret = op; 584 } 585 if (min > 0) { 586 for (int i = 0; i < min; i ++) { 587 ret = compile(child, ret, reverse); 588 } 589 } 590 break; 591 592 case Token.EMPTY: 593 ret = next; 594 break; 595 596 case Token.STRING: 597 ret = Op.createString(tok.getString()); 598 ret.next = next; 599 break; 600 601 case Token.BACKREFERENCE: 602 ret = Op.createBackReference(tok.getReferenceNumber()); 603 ret.next = next; 604 break; 605 606 case Token.PAREN: 607 if (tok.getParenNumber() == 0) { 608 ret = compile(tok.getChild(0), next, reverse); 609 } else if (reverse) { 610 next = Op.createCapture(tok.getParenNumber(), next); 611 next = compile(tok.getChild(0), next, reverse); 612 ret = Op.createCapture(-tok.getParenNumber(), next); 613 } else { 614 next = Op.createCapture(-tok.getParenNumber(), next); 615 next = compile(tok.getChild(0), next, reverse); 616 ret = Op.createCapture(tok.getParenNumber(), next); 617 } 618 break; 619 620 case Token.LOOKAHEAD: 621 ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false)); 622 break; 623 case Token.NEGATIVELOOKAHEAD: 624 ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false)); 625 break; 626 case Token.LOOKBEHIND: 627 ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true)); 628 break; 629 case Token.NEGATIVELOOKBEHIND: 630 ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true)); 631 
break; 632 633 case Token.INDEPENDENT: 634 ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse)); 635 break; 636 637 case Token.MODIFIERGROUP: 638 ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse), 639 ((Token.ModifierToken)tok).getOptions(), 640 ((Token.ModifierToken)tok).getOptionsMask()); 641 break; 642 643 case Token.CONDITION: 644 Token.ConditionToken ctok = (Token.ConditionToken)tok; 645 int ref = ctok.refNumber; 646 Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse); 647 Op yes = compile(ctok.yes, next, reverse); 648 Op no = ctok.no == null ? null : compile(ctok.no, next, reverse); 649 ret = Op.createCondition(next, ref, condition, yes, no); 650 break; 651 652 default: 653 throw new RuntimeException("Unknown token type: "+tok.type); 654 } // switch (tok.type) 655 return ret; 656 } 657 658 659 //Public 660 661 /** 662 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 663 * 664 * @return true if the target is matched to this regular expression. 665 */ 666 public boolean matches(char[] target) { 667 return this.matches(target, 0, target .length , (Match)null); 668 } 669 670 /** 671 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 672 * in specified range or not. 673 * 674 * @param start Start offset of the range. 675 * @param end End offset +1 of the range. 676 * @return true if the target is matched to this regular expression. 677 */ 678 public boolean matches(char[] target, int start, int end) { 679 return this.matches(target, start, end, (Match)null); 680 } 681 682 /** 683 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 684 * 685 * @param match A Match instance for storing matching result. 686 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 
687 */ 688 public boolean matches(char[] target, Match match) { 689 return this.matches(target, 0, target .length , match); 690 } 691 692 693 /** 694 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 695 * in specified range or not. 696 * 697 * @param start Start offset of the range. 698 * @param end End offset +1 of the range. 699 * @param match A Match instance for storing matching result. 700 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 701 */ 702 public boolean matches(char[] target, int start, int end, Match match) { 703 704 synchronized (this) { 705 if (this.operations == null) 706 this.prepare(); 707 if (this.context == null) 708 this.context = new Context(); 709 } 710 Context con = null; 711 synchronized (this.context) { 712 con = this.context.inuse ? new Context() : this.context; 713 con.reset(target, start, end, this.numberOfClosures); 714 } 715 if (match != null) { 716 match.setNumberOfGroups(this.nofparen); 717 match.setSource(target); 718 } else if (this.hasBackReferences) { 719 match = new Match(); 720 match.setNumberOfGroups(this.nofparen); 721 // Need not to call setSource() because 722 // a caller can not access this match instance. 723 } 724 con.match = match; 725 726 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 727 int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options); 728 //System.err.println("DEBUG: matchEnd="+matchEnd); 729 if (matchEnd == con.limit) { 730 if (con.match != null) { 731 con.match.setBeginning(0, con.start); 732 con.match.setEnd(0, matchEnd); 733 } 734 con.inuse = false; 735 return true; 736 } 737 return false; 738 } 739 740 /* 741 * The pattern has only fixed string. 742 * The engine uses Boyer-Moore. 
743 */ 744 if (this.fixedStringOnly) { 745 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 746 int o = this.fixedStringTable.matches(target, con.start, con.limit); 747 if (o >= 0) { 748 if (con.match != null) { 749 con.match.setBeginning(0, o); 750 con.match.setEnd(0, o+this.fixedString.length()); 751 } 752 con.inuse = false; 753 return true; 754 } 755 con.inuse = false; 756 return false; 757 } 758 759 /* 760 * The pattern contains a fixed string. 761 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 762 * If not, it return with false. 763 */ 764 if (this.fixedString != null) { 765 int o = this.fixedStringTable.matches(target, con.start, con.limit); 766 if (o < 0) { 767 //System.err.println("Non-match in fixed-string search."); 768 con.inuse = false; 769 return false; 770 } 771 } 772 773 int limit = con.limit-this.minlength; 774 int matchStart; 775 int matchEnd = -1; 776 777 /* 778 * Checks whether the expression starts with ".*". 779 */ 780 if (this.operations != null 781 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 782 if (isSet(this.options, SINGLE_LINE)) { 783 matchStart = con.start; 784 matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options); 785 } else { 786 boolean previousIsEOL = true; 787 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 788 int ch = target [ matchStart ] ; 789 if (isEOLChar(ch)) { 790 previousIsEOL = true; 791 } else { 792 if (previousIsEOL) { 793 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, 794 matchStart, 1, this.options))) 795 break; 796 } 797 previousIsEOL = false; 798 } 799 } 800 } 801 } 802 803 /* 804 * Optimization against the first character. 
805 */ 806 else if (this.firstChar != null) { 807 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 808 RangeToken range = this.firstChar; 809 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 810 range = this.firstChar.getCaseInsensitiveToken(); 811 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 812 int ch = target [ matchStart ] ; 813 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 814 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] ); 815 if (!range.match(ch)) continue; 816 } else { 817 if (!range.match(ch)) { 818 char ch1 = Character.toUpperCase((char)ch); 819 if (!range.match(ch1)) 820 if (!range.match(Character.toLowerCase(ch1))) 821 continue; 822 } 823 } 824 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, 825 matchStart, 1, this.options))) 826 break; 827 } 828 } else { 829 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 830 int ch = target [ matchStart ] ; 831 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 832 ch = REUtil.composeFromSurrogates(ch, target [ matchStart+1 ] ); 833 if (!range.match(ch)) continue; 834 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, 835 matchStart, 1, this.options))) 836 break; 837 } 838 } 839 } 840 841 /* 842 * Straightforward matching. 843 */ 844 else { 845 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 846 if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options))) 847 break; 848 } 849 } 850 851 if (matchEnd >= 0) { 852 if (con.match != null) { 853 con.match.setBeginning(0, matchStart); 854 con.match.setEnd(0, matchEnd); 855 } 856 con.inuse = false; 857 return true; 858 } else { 859 con.inuse = false; 860 return false; 861 } 862 } 863 864 /** 865 * @return -1 when not match; offset of the end of matched string when match. 
866 */ 867 private int matchCharArray (Context con, Op op, int offset, int dx, int opts) { 868 869 char[] target = con.charTarget; 870 871 872 while (true) { 873 if (op == null) 874 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 875 if (offset > con.limit || offset < con.start) 876 return -1; 877 switch (op.type) { 878 case Op.CHAR: 879 if (isSet(opts, IGNORE_CASE)) { 880 int ch = op.getData(); 881 if (dx > 0) { 882 if (offset >= con.limit || !matchIgnoreCase(ch, target [ offset ] )) 883 return -1; 884 offset ++; 885 } else { 886 int o1 = offset-1; 887 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target [ o1 ] )) 888 return -1; 889 offset = o1; 890 } 891 } else { 892 int ch = op.getData(); 893 if (dx > 0) { 894 if (offset >= con.limit || ch != target [ offset ] ) 895 return -1; 896 offset ++; 897 } else { 898 int o1 = offset-1; 899 if (o1 >= con.limit || o1 < 0 || ch != target [ o1 ] ) 900 return -1; 901 offset = o1; 902 } 903 } 904 op = op.next; 905 break; 906 907 case Op.DOT: 908 if (dx > 0) { 909 if (offset >= con.limit) 910 return -1; 911 int ch = target [ offset ] ; 912 if (isSet(opts, SINGLE_LINE)) { 913 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 914 offset ++; 915 } else { 916 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 917 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] ); 918 if (isEOLChar(ch)) 919 return -1; 920 } 921 offset ++; 922 } else { 923 int o1 = offset-1; 924 if (o1 >= con.limit || o1 < 0) 925 return -1; 926 int ch = target [ o1 ] ; 927 if (isSet(opts, SINGLE_LINE)) { 928 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 929 o1 --; 930 } else { 931 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 932 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch); 933 if (!isEOLChar(ch)) 934 return -1; 935 } 936 offset = o1; 937 } 938 op = op.next; 939 break; 940 941 case Op.RANGE: 942 case Op.NRANGE: 943 if (dx > 0) { 944 if (offset >= con.limit) 945 return -1; 946 int ch = target [ offset ] ; 
947 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 948 ch = REUtil.composeFromSurrogates(ch, target [ ++offset ] ); 949 RangeToken tok = op.getToken(); 950 if (isSet(opts, IGNORE_CASE)) { 951 tok = tok.getCaseInsensitiveToken(); 952 if (!tok.match(ch)) { 953 if (ch >= 0x10000) return -1; 954 char uch; 955 if (!tok.match(uch = Character.toUpperCase((char)ch)) 956 && !tok.match(Character.toLowerCase(uch))) 957 return -1; 958 } 959 } else { 960 if (!tok.match(ch)) return -1; 961 } 962 offset ++; 963 } else { 964 int o1 = offset-1; 965 if (o1 >= con.limit || o1 < 0) 966 return -1; 967 int ch = target [ o1 ] ; 968 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 969 ch = REUtil.composeFromSurrogates( target [ --o1 ] , ch); 970 RangeToken tok = op.getToken(); 971 if (isSet(opts, IGNORE_CASE)) { 972 tok = tok.getCaseInsensitiveToken(); 973 if (!tok.match(ch)) { 974 if (ch >= 0x10000) return -1; 975 char uch; 976 if (!tok.match(uch = Character.toUpperCase((char)ch)) 977 && !tok.match(Character.toLowerCase(uch))) 978 return -1; 979 } 980 } else { 981 if (!tok.match(ch)) return -1; 982 } 983 offset = o1; 984 } 985 op = op.next; 986 break; 987 988 case Op.ANCHOR: 989 boolean go = false; 990 switch (op.getData()) { 991 case '^': 992 if (isSet(opts, MULTIPLE_LINES)) { 993 if (!(offset == con.start 994 || offset > con.start && isEOLChar( target [ offset-1 ] ))) 995 return -1; 996 } else { 997 if (offset != con.start) 998 return -1; 999 } 1000 break; 1001 1002 case '@': // Internal use only. 1003 // The @ always matches line beginnings. 
1004 if (!(offset == con.start 1005 || offset > con.start && isEOLChar( target [ offset-1 ] ))) 1006 return -1; 1007 break; 1008 1009 case '$': 1010 if (isSet(opts, MULTIPLE_LINES)) { 1011 if (!(offset == con.limit 1012 || offset < con.limit && isEOLChar( target [ offset ] ))) 1013 return -1; 1014 } else { 1015 if (!(offset == con.limit 1016 || offset+1 == con.limit && isEOLChar( target [ offset ] ) 1017 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN 1018 && target [ offset+1 ] == LINE_FEED)) 1019 return -1; 1020 } 1021 break; 1022 1023 case 'A': 1024 if (offset != con.start) return -1; 1025 break; 1026 1027 case 'Z': 1028 if (!(offset == con.limit 1029 || offset+1 == con.limit && isEOLChar( target [ offset ] ) 1030 || offset+2 == con.limit && target [ offset ] == CARRIAGE_RETURN 1031 && target [ offset+1 ] == LINE_FEED)) 1032 return -1; 1033 break; 1034 1035 case 'z': 1036 if (offset != con.limit) return -1; 1037 break; 1038 1039 case 'b': 1040 if (con.length == 0) return -1; 1041 { 1042 int after = getWordType(target, con.start, con.limit, offset, opts); 1043 if (after == WT_IGNORE) return -1; 1044 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 1045 if (after == before) return -1; 1046 } 1047 break; 1048 1049 case 'B': 1050 if (con.length == 0) 1051 go = true; 1052 else { 1053 int after = getWordType(target, con.start, con.limit, offset, opts); 1054 go = after == WT_IGNORE 1055 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 1056 } 1057 if (!go) return -1; 1058 break; 1059 1060 case '<': 1061 if (con.length == 0 || offset == con.limit) return -1; 1062 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 1063 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 1064 return -1; 1065 break; 1066 1067 case '>': 1068 if (con.length == 0 || offset == con.start) return -1; 1069 if (getWordType(target, con.start, con.limit, offset, opts) != 
WT_OTHER 1070 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 1071 return -1; 1072 break; 1073 } // switch anchor type 1074 op = op.next; 1075 break; 1076 1077 case Op.BACKREFERENCE: 1078 { 1079 int refno = op.getData(); 1080 if (refno <= 0 || refno >= this.nofparen) 1081 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); 1082 if (con.match.getBeginning(refno) < 0 1083 || con.match.getEnd(refno) < 0) 1084 return -1; // ******** 1085 int o2 = con.match.getBeginning(refno); 1086 int literallen = con.match.getEnd(refno)-o2; 1087 if (!isSet(opts, IGNORE_CASE)) { 1088 if (dx > 0) { 1089 if (!regionMatches(target, offset, con.limit, o2, literallen)) 1090 return -1; 1091 offset += literallen; 1092 } else { 1093 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) 1094 return -1; 1095 offset -= literallen; 1096 } 1097 } else { 1098 if (dx > 0) { 1099 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) 1100 return -1; 1101 offset += literallen; 1102 } else { 1103 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 1104 o2, literallen)) 1105 return -1; 1106 offset -= literallen; 1107 } 1108 } 1109 } 1110 op = op.next; 1111 break; 1112 case Op.STRING: 1113 { 1114 String literal = op.getString(); 1115 int literallen = literal.length(); 1116 if (!isSet(opts, IGNORE_CASE)) { 1117 if (dx > 0) { 1118 if (!regionMatches(target, offset, con.limit, literal, literallen)) 1119 return -1; 1120 offset += literallen; 1121 } else { 1122 if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) 1123 return -1; 1124 offset -= literallen; 1125 } 1126 } else { 1127 if (dx > 0) { 1128 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) 1129 return -1; 1130 offset += literallen; 1131 } else { 1132 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 1133 literal, literallen)) 1134 return -1; 1135 offset 
-= literallen; 1136 } 1137 } 1138 } 1139 op = op.next; 1140 break; 1141 1142 case Op.CLOSURE: 1143 { 1144 /* 1145 * Saves current position to avoid 1146 * zero-width repeats. 1147 */ 1148 int id = op.getData(); 1149 if (id >= 0) { 1150 int previousOffset = con.offsets[id]; 1151 if (previousOffset < 0 || previousOffset != offset) { 1152 con.offsets[id] = offset; 1153 } else { 1154 con.offsets[id] = -1; 1155 op = op.next; 1156 break; 1157 } 1158 } 1159 1160 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 1161 if (id >= 0) con.offsets[id] = -1; 1162 if (ret >= 0) return ret; 1163 op = op.next; 1164 } 1165 break; 1166 1167 case Op.QUESTION: 1168 { 1169 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 1170 if (ret >= 0) return ret; 1171 op = op.next; 1172 } 1173 break; 1174 1175 case Op.NONGREEDYCLOSURE: 1176 case Op.NONGREEDYQUESTION: 1177 { 1178 int ret = this. matchCharArray (con, op.next, offset, dx, opts); 1179 if (ret >= 0) return ret; 1180 op = op.getChild(); 1181 } 1182 break; 1183 1184 case Op.UNION: 1185 for (int i = 0; i < op.size(); i ++) { 1186 int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts); 1187 if (DEBUG) { 1188 System.err.println("UNION: "+i+", ret="+ret); 1189 } 1190 if (ret >= 0) return ret; 1191 } 1192 return -1; 1193 1194 case Op.CAPTURE: 1195 int refno = op.getData(); 1196 if (con.match != null && refno > 0) { 1197 int save = con.match.getBeginning(refno); 1198 con.match.setBeginning(refno, offset); 1199 int ret = this. matchCharArray (con, op.next, offset, dx, opts); 1200 if (ret < 0) con.match.setBeginning(refno, save); 1201 return ret; 1202 } else if (con.match != null && refno < 0) { 1203 int index = -refno; 1204 int save = con.match.getEnd(index); 1205 con.match.setEnd(index, offset); 1206 int ret = this. 
matchCharArray (con, op.next, offset, dx, opts); 1207 if (ret < 0) con.match.setEnd(index, save); 1208 return ret; 1209 } 1210 op = op.next; 1211 break; 1212 1213 case Op.LOOKAHEAD: 1214 if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1; 1215 op = op.next; 1216 break; 1217 case Op.NEGATIVELOOKAHEAD: 1218 if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts)) return -1; 1219 op = op.next; 1220 break; 1221 case Op.LOOKBEHIND: 1222 if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1; 1223 op = op.next; 1224 break; 1225 case Op.NEGATIVELOOKBEHIND: 1226 if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts)) return -1; 1227 op = op.next; 1228 break; 1229 1230 case Op.INDEPENDENT: 1231 { 1232 int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts); 1233 if (ret < 0) return ret; 1234 offset = ret; 1235 op = op.next; 1236 } 1237 break; 1238 1239 case Op.MODIFIER: 1240 { 1241 int localopts = opts; 1242 localopts |= op.getData(); 1243 localopts &= ~op.getData2(); 1244 //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); 1245 int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts); 1246 if (ret < 0) return ret; 1247 offset = ret; 1248 op = op.next; 1249 } 1250 break; 1251 1252 case Op.CONDITION: 1253 { 1254 Op.ConditionOp cop = (Op.ConditionOp)op; 1255 boolean matchp = false; 1256 if (cop.refNumber > 0) { 1257 if (cop.refNumber >= this.nofparen) 1258 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); 1259 matchp = con.match.getBeginning(cop.refNumber) >= 0 1260 && con.match.getEnd(cop.refNumber) >= 0; 1261 } else { 1262 matchp = 0 <= this. 
matchCharArray (con, cop.condition, offset, dx, opts); 1263 } 1264 1265 if (matchp) { 1266 op = cop.yes; 1267 } else if (cop.no != null) { 1268 op = cop.no; 1269 } else { 1270 op = cop.next; 1271 } 1272 } 1273 break; 1274 1275 default: 1276 throw new RuntimeException("Unknown operation type: "+op.type); 1277 } // switch (op.type) 1278 } // while 1279 } 1280 1281 private static final int getPreviousWordType(char[] target, int begin, int end, 1282 int offset, int opts) { 1283 int ret = getWordType(target, begin, end, --offset, opts); 1284 while (ret == WT_IGNORE) 1285 ret = getWordType(target, begin, end, --offset, opts); 1286 return ret; 1287 } 1288 1289 private static final int getWordType(char[] target, int begin, int end, 1290 int offset, int opts) { 1291 if (offset < begin || offset >= end) return WT_OTHER; 1292 return getWordType0( target [ offset ] , opts); 1293 } 1294 1295 1296 1297 private static final boolean regionMatches(char[] target, int offset, int limit, 1298 String part, int partlen) { 1299 if (offset < 0) return false; 1300 if (limit-offset < partlen) 1301 return false; 1302 int i = 0; 1303 while (partlen-- > 0) { 1304 if ( target [ offset++ ] != part.charAt(i++)) 1305 return false; 1306 } 1307 return true; 1308 } 1309 1310 private static final boolean regionMatches(char[] target, int offset, int limit, 1311 int offset2, int partlen) { 1312 if (offset < 0) return false; 1313 if (limit-offset < partlen) 1314 return false; 1315 int i = offset2; 1316 while (partlen-- > 0) { 1317 if ( target [ offset++ ] != target [ i++ ] ) 1318 return false; 1319 } 1320 return true; 1321 } 1322 1323 /** 1324 * @see java.lang.String#regionMatches 1325 */ 1326 private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit, 1327 String part, int partlen) { 1328 if (offset < 0) return false; 1329 if (limit-offset < partlen) 1330 return false; 1331 int i = 0; 1332 while (partlen-- > 0) { 1333 char ch1 = target [ offset++ ] ; 1334 char ch2 = 
part.charAt(i++); 1335 if (ch1 == ch2) 1336 continue; 1337 char uch1 = Character.toUpperCase(ch1); 1338 char uch2 = Character.toUpperCase(ch2); 1339 if (uch1 == uch2) 1340 continue; 1341 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 1342 return false; 1343 } 1344 return true; 1345 } 1346 1347 private static final boolean regionMatchesIgnoreCase(char[] target, int offset, int limit, 1348 int offset2, int partlen) { 1349 if (offset < 0) return false; 1350 if (limit-offset < partlen) 1351 return false; 1352 int i = offset2; 1353 while (partlen-- > 0) { 1354 char ch1 = target [ offset++ ] ; 1355 char ch2 = target [ i++ ] ; 1356 if (ch1 == ch2) 1357 continue; 1358 char uch1 = Character.toUpperCase(ch1); 1359 char uch2 = Character.toUpperCase(ch2); 1360 if (uch1 == uch2) 1361 continue; 1362 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 1363 return false; 1364 } 1365 return true; 1366 } 1367 1368 1369 1370 1371 /** 1372 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 1373 * 1374 * @return true if the target is matched to this regular expression. 1375 */ 1376 public boolean matches(String target) { 1377 return this.matches(target, 0, target .length() , (Match)null); 1378 } 1379 1380 /** 1381 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 1382 * in specified range or not. 1383 * 1384 * @param start Start offset of the range. 1385 * @param end End offset +1 of the range. 1386 * @return true if the target is matched to this regular expression. 1387 */ 1388 public boolean matches(String target, int start, int end) { 1389 return this.matches(target, start, end, (Match)null); 1390 } 1391 1392 /** 1393 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 1394 * 1395 * @param match A Match instance for storing matching result. 1396 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 
1397 */ 1398 public boolean matches(String target, Match match) { 1399 return this.matches(target, 0, target .length() , match); 1400 } 1401 1402 /** 1403 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern 1404 * in specified range or not. 1405 * 1406 * @param start Start offset of the range. 1407 * @param end End offset +1 of the range. 1408 * @param match A Match instance for storing matching result. 1409 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 1410 */ 1411 public boolean matches(String target, int start, int end, Match match) { 1412 1413 synchronized (this) { 1414 if (this.operations == null) 1415 this.prepare(); 1416 if (this.context == null) 1417 this.context = new Context(); 1418 } 1419 Context con = null; 1420 synchronized (this.context) { 1421 con = this.context.inuse ? new Context() : this.context; 1422 con.reset(target, start, end, this.numberOfClosures); 1423 } 1424 if (match != null) { 1425 match.setNumberOfGroups(this.nofparen); 1426 match.setSource(target); 1427 } else if (this.hasBackReferences) { 1428 match = new Match(); 1429 match.setNumberOfGroups(this.nofparen); 1430 // Need not to call setSource() because 1431 // a caller can not access this match instance. 1432 } 1433 con.match = match; 1434 1435 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 1436 if (DEBUG) { 1437 System.err.println("target string="+target); 1438 } 1439 int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options); 1440 if (DEBUG) { 1441 System.err.println("matchEnd="+matchEnd); 1442 System.err.println("con.limit="+con.limit); 1443 } 1444 if (matchEnd == con.limit) { 1445 if (con.match != null) { 1446 con.match.setBeginning(0, con.start); 1447 con.match.setEnd(0, matchEnd); 1448 } 1449 con.inuse = false; 1450 return true; 1451 } 1452 return false; 1453 } 1454 1455 /* 1456 * The pattern has only fixed string. 1457 * The engine uses Boyer-Moore. 
1458 */ 1459 if (this.fixedStringOnly) { 1460 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 1461 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1462 if (o >= 0) { 1463 if (con.match != null) { 1464 con.match.setBeginning(0, o); 1465 con.match.setEnd(0, o+this.fixedString.length()); 1466 } 1467 con.inuse = false; 1468 return true; 1469 } 1470 con.inuse = false; 1471 return false; 1472 } 1473 1474 /* 1475 * The pattern contains a fixed string. 1476 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 1477 * If not, it return with false. 1478 */ 1479 if (this.fixedString != null) { 1480 int o = this.fixedStringTable.matches(target, con.start, con.limit); 1481 if (o < 0) { 1482 //System.err.println("Non-match in fixed-string search."); 1483 con.inuse = false; 1484 return false; 1485 } 1486 } 1487 1488 int limit = con.limit-this.minlength; 1489 int matchStart; 1490 int matchEnd = -1; 1491 1492 /* 1493 * Checks whether the expression starts with ".*". 1494 */ 1495 if (this.operations != null 1496 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 1497 if (isSet(this.options, SINGLE_LINE)) { 1498 matchStart = con.start; 1499 matchEnd = this. matchString (con, this.operations, con.start, 1, this.options); 1500 } else { 1501 boolean previousIsEOL = true; 1502 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1503 int ch = target .charAt( matchStart ) ; 1504 if (isEOLChar(ch)) { 1505 previousIsEOL = true; 1506 } else { 1507 if (previousIsEOL) { 1508 if (0 <= (matchEnd = this. matchString (con, this.operations, 1509 matchStart, 1, this.options))) 1510 break; 1511 } 1512 previousIsEOL = false; 1513 } 1514 } 1515 } 1516 } 1517 1518 /* 1519 * Optimization against the first character. 
1520 */ 1521 else if (this.firstChar != null) { 1522 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 1523 RangeToken range = this.firstChar; 1524 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 1525 range = this.firstChar.getCaseInsensitiveToken(); 1526 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1527 int ch = target .charAt( matchStart ) ; 1528 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 1529 ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) ); 1530 if (!range.match(ch)) continue; 1531 } else { 1532 if (!range.match(ch)) { 1533 char ch1 = Character.toUpperCase((char)ch); 1534 if (!range.match(ch1)) 1535 if (!range.match(Character.toLowerCase(ch1))) 1536 continue; 1537 } 1538 } 1539 if (0 <= (matchEnd = this. matchString (con, this.operations, 1540 matchStart, 1, this.options))) 1541 break; 1542 } 1543 } else { 1544 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1545 int ch = target .charAt( matchStart ) ; 1546 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 1547 ch = REUtil.composeFromSurrogates(ch, target .charAt( matchStart+1 ) ); 1548 if (!range.match(ch)) continue; 1549 if (0 <= (matchEnd = this. matchString (con, this.operations, 1550 matchStart, 1, this.options))) 1551 break; 1552 } 1553 } 1554 } 1555 1556 /* 1557 * Straightforward matching. 1558 */ 1559 else { 1560 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 1561 if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options))) 1562 break; 1563 } 1564 } 1565 1566 if (matchEnd >= 0) { 1567 if (con.match != null) { 1568 con.match.setBeginning(0, matchStart); 1569 con.match.setEnd(0, matchEnd); 1570 } 1571 con.inuse = false; 1572 return true; 1573 } else { 1574 con.inuse = false; 1575 return false; 1576 } 1577 } 1578 1579 /** 1580 * @return -1 when not match; offset of the end of matched string when match. 
1581 */ 1582 private int matchString (Context con, Op op, int offset, int dx, int opts) { 1583 1584 1585 1586 1587 String target = con.strTarget; 1588 1589 1590 1591 1592 while (true) { 1593 if (op == null) 1594 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 1595 if (offset > con.limit || offset < con.start) 1596 return -1; 1597 switch (op.type) { 1598 case Op.CHAR: 1599 if (isSet(opts, IGNORE_CASE)) { 1600 int ch = op.getData(); 1601 if (dx > 0) { 1602 if (offset >= con.limit || !matchIgnoreCase(ch, target .charAt( offset ) )) 1603 return -1; 1604 offset ++; 1605 } else { 1606 int o1 = offset-1; 1607 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .charAt( o1 ) )) 1608 return -1; 1609 offset = o1; 1610 } 1611 } else { 1612 int ch = op.getData(); 1613 if (dx > 0) { 1614 if (offset >= con.limit || ch != target .charAt( offset ) ) 1615 return -1; 1616 offset ++; 1617 } else { 1618 int o1 = offset-1; 1619 if (o1 >= con.limit || o1 < 0 || ch != target .charAt( o1 ) ) 1620 return -1; 1621 offset = o1; 1622 } 1623 } 1624 op = op.next; 1625 break; 1626 1627 case Op.DOT: 1628 if (dx > 0) { 1629 if (offset >= con.limit) 1630 return -1; 1631 int ch = target .charAt( offset ) ; 1632 if (isSet(opts, SINGLE_LINE)) { 1633 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1634 offset ++; 1635 } else { 1636 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1637 ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) ); 1638 if (isEOLChar(ch)) 1639 return -1; 1640 } 1641 offset ++; 1642 } else { 1643 int o1 = offset-1; 1644 if (o1 >= con.limit || o1 < 0) 1645 return -1; 1646 int ch = target .charAt( o1 ) ; 1647 if (isSet(opts, SINGLE_LINE)) { 1648 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1649 o1 --; 1650 } else { 1651 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1652 ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch); 1653 if (!isEOLChar(ch)) 1654 return -1; 1655 } 1656 offset = o1; 1657 } 1658 op = op.next; 
1659 break; 1660 1661 case Op.RANGE: 1662 case Op.NRANGE: 1663 if (dx > 0) { 1664 if (offset >= con.limit) 1665 return -1; 1666 int ch = target .charAt( offset ) ; 1667 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 1668 ch = REUtil.composeFromSurrogates(ch, target .charAt( ++offset ) ); 1669 RangeToken tok = op.getToken(); 1670 if (isSet(opts, IGNORE_CASE)) { 1671 tok = tok.getCaseInsensitiveToken(); 1672 if (!tok.match(ch)) { 1673 if (ch >= 0x10000) return -1; 1674 char uch; 1675 if (!tok.match(uch = Character.toUpperCase((char)ch)) 1676 && !tok.match(Character.toLowerCase(uch))) 1677 return -1; 1678 } 1679 } else { 1680 if (!tok.match(ch)) return -1; 1681 } 1682 offset ++; 1683 } else { 1684 int o1 = offset-1; 1685 if (o1 >= con.limit || o1 < 0) 1686 return -1; 1687 int ch = target .charAt( o1 ) ; 1688 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 1689 ch = REUtil.composeFromSurrogates( target .charAt( --o1 ) , ch); 1690 RangeToken tok = op.getToken(); 1691 if (isSet(opts, IGNORE_CASE)) { 1692 tok = tok.getCaseInsensitiveToken(); 1693 if (!tok.match(ch)) { 1694 if (ch >= 0x10000) return -1; 1695 char uch; 1696 if (!tok.match(uch = Character.toUpperCase((char)ch)) 1697 && !tok.match(Character.toLowerCase(uch))) 1698 return -1; 1699 } 1700 } else { 1701 if (!tok.match(ch)) return -1; 1702 } 1703 offset = o1; 1704 } 1705 op = op.next; 1706 break; 1707 1708 case Op.ANCHOR: 1709 boolean go = false; 1710 switch (op.getData()) { 1711 case '^': 1712 if (isSet(opts, MULTIPLE_LINES)) { 1713 if (!(offset == con.start 1714 || offset > con.start && isEOLChar( target .charAt( offset-1 ) ))) 1715 return -1; 1716 } else { 1717 if (offset != con.start) 1718 return -1; 1719 } 1720 break; 1721 1722 case '@': // Internal use only. 1723 // The @ always matches line beginnings. 
1724 if (!(offset == con.start 1725 || offset > con.start && isEOLChar( target .charAt( offset-1 ) ))) 1726 return -1; 1727 break; 1728 1729 case '$': 1730 if (isSet(opts, MULTIPLE_LINES)) { 1731 if (!(offset == con.limit 1732 || offset < con.limit && isEOLChar( target .charAt( offset ) ))) 1733 return -1; 1734 } else { 1735 if (!(offset == con.limit 1736 || offset+1 == con.limit && isEOLChar( target .charAt( offset ) ) 1737 || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN 1738 && target .charAt( offset+1 ) == LINE_FEED)) 1739 return -1; 1740 } 1741 break; 1742 1743 case 'A': 1744 if (offset != con.start) return -1; 1745 break; 1746 1747 case 'Z': 1748 if (!(offset == con.limit 1749 || offset+1 == con.limit && isEOLChar( target .charAt( offset ) ) 1750 || offset+2 == con.limit && target .charAt( offset ) == CARRIAGE_RETURN 1751 && target .charAt( offset+1 ) == LINE_FEED)) 1752 return -1; 1753 break; 1754 1755 case 'z': 1756 if (offset != con.limit) return -1; 1757 break; 1758 1759 case 'b': 1760 if (con.length == 0) return -1; 1761 { 1762 int after = getWordType(target, con.start, con.limit, offset, opts); 1763 if (after == WT_IGNORE) return -1; 1764 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 1765 if (after == before) return -1; 1766 } 1767 break; 1768 1769 case 'B': 1770 if (con.length == 0) 1771 go = true; 1772 else { 1773 int after = getWordType(target, con.start, con.limit, offset, opts); 1774 go = after == WT_IGNORE 1775 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 1776 } 1777 if (!go) return -1; 1778 break; 1779 1780 case '<': 1781 if (con.length == 0 || offset == con.limit) return -1; 1782 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 1783 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 1784 return -1; 1785 break; 1786 1787 case '>': 1788 if (con.length == 0 || offset == con.start) return -1; 1789 if 
(getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 1790 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 1791 return -1; 1792 break; 1793 } // switch anchor type 1794 op = op.next; 1795 break; 1796 1797 case Op.BACKREFERENCE: 1798 { 1799 int refno = op.getData(); 1800 if (refno <= 0 || refno >= this.nofparen) 1801 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); 1802 if (con.match.getBeginning(refno) < 0 1803 || con.match.getEnd(refno) < 0) 1804 return -1; // ******** 1805 int o2 = con.match.getBeginning(refno); 1806 int literallen = con.match.getEnd(refno)-o2; 1807 if (!isSet(opts, IGNORE_CASE)) { 1808 if (dx > 0) { 1809 if (!regionMatches(target, offset, con.limit, o2, literallen)) 1810 return -1; 1811 offset += literallen; 1812 } else { 1813 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) 1814 return -1; 1815 offset -= literallen; 1816 } 1817 } else { 1818 if (dx > 0) { 1819 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) 1820 return -1; 1821 offset += literallen; 1822 } else { 1823 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 1824 o2, literallen)) 1825 return -1; 1826 offset -= literallen; 1827 } 1828 } 1829 } 1830 op = op.next; 1831 break; 1832 case Op.STRING: 1833 { 1834 String literal = op.getString(); 1835 int literallen = literal.length(); 1836 if (!isSet(opts, IGNORE_CASE)) { 1837 if (dx > 0) { 1838 if (!regionMatches(target, offset, con.limit, literal, literallen)) 1839 return -1; 1840 offset += literallen; 1841 } else { 1842 if (!regionMatches(target, offset-literallen, con.limit, literal, literallen)) 1843 return -1; 1844 offset -= literallen; 1845 } 1846 } else { 1847 if (dx > 0) { 1848 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) 1849 return -1; 1850 offset += literallen; 1851 } else { 1852 if (!regionMatchesIgnoreCase(target, offset-literallen, 
con.limit, 1853 literal, literallen)) 1854 return -1; 1855 offset -= literallen; 1856 } 1857 } 1858 } 1859 op = op.next; 1860 break; 1861 1862 case Op.CLOSURE: 1863 { 1864 /* 1865 * Saves current position to avoid 1866 * zero-width repeats. 1867 */ 1868 int id = op.getData(); 1869 if (id >= 0) { 1870 int previousOffset = con.offsets[id]; 1871 if (previousOffset < 0 || previousOffset != offset) { 1872 con.offsets[id] = offset; 1873 } else { 1874 con.offsets[id] = -1; 1875 op = op.next; 1876 break; 1877 } 1878 } 1879 int ret = this. matchString (con, op.getChild(), offset, dx, opts); 1880 if (id >= 0) con.offsets[id] = -1; 1881 if (ret >= 0) return ret; 1882 op = op.next; 1883 } 1884 break; 1885 1886 case Op.QUESTION: 1887 { 1888 int ret = this. matchString (con, op.getChild(), offset, dx, opts); 1889 if (ret >= 0) return ret; 1890 op = op.next; 1891 } 1892 break; 1893 1894 case Op.NONGREEDYCLOSURE: 1895 case Op.NONGREEDYQUESTION: 1896 { 1897 int ret = this. matchString (con, op.next, offset, dx, opts); 1898 if (ret >= 0) return ret; 1899 op = op.getChild(); 1900 } 1901 break; 1902 1903 case Op.UNION: 1904 for (int i = 0; i < op.size(); i ++) { 1905 int ret = this. matchString (con, op.elementAt(i), offset, dx, opts); 1906 if (DEBUG) { 1907 System.err.println("UNION: "+i+", ret="+ret); 1908 } 1909 if (ret >= 0) return ret; 1910 } 1911 return -1; 1912 1913 case Op.CAPTURE: 1914 int refno = op.getData(); 1915 if (con.match != null && refno > 0) { 1916 int save = con.match.getBeginning(refno); 1917 con.match.setBeginning(refno, offset); 1918 int ret = this. matchString (con, op.next, offset, dx, opts); 1919 if (ret < 0) con.match.setBeginning(refno, save); 1920 return ret; 1921 } else if (con.match != null && refno < 0) { 1922 int index = -refno; 1923 int save = con.match.getEnd(index); 1924 con.match.setEnd(index, offset); 1925 int ret = this. 
matchString (con, op.next, offset, dx, opts); 1926 if (ret < 0) con.match.setEnd(index, save); 1927 return ret; 1928 } 1929 op = op.next; 1930 break; 1931 1932 case Op.LOOKAHEAD: 1933 if (0 > this. matchString (con, op.getChild(), offset, 1, opts)) return -1; 1934 op = op.next; 1935 break; 1936 case Op.NEGATIVELOOKAHEAD: 1937 if (0 <= this. matchString (con, op.getChild(), offset, 1, opts)) return -1; 1938 op = op.next; 1939 break; 1940 case Op.LOOKBEHIND: 1941 if (0 > this. matchString (con, op.getChild(), offset, -1, opts)) return -1; 1942 op = op.next; 1943 break; 1944 case Op.NEGATIVELOOKBEHIND: 1945 if (0 <= this. matchString (con, op.getChild(), offset, -1, opts)) return -1; 1946 op = op.next; 1947 break; 1948 1949 case Op.INDEPENDENT: 1950 { 1951 int ret = this. matchString (con, op.getChild(), offset, dx, opts); 1952 if (ret < 0) return ret; 1953 offset = ret; 1954 op = op.next; 1955 } 1956 break; 1957 1958 case Op.MODIFIER: 1959 { 1960 int localopts = opts; 1961 localopts |= op.getData(); 1962 localopts &= ~op.getData2(); 1963 //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); 1964 int ret = this. matchString (con, op.getChild(), offset, dx, localopts); 1965 if (ret < 0) return ret; 1966 offset = ret; 1967 op = op.next; 1968 } 1969 break; 1970 1971 case Op.CONDITION: 1972 { 1973 Op.ConditionOp cop = (Op.ConditionOp)op; 1974 boolean matchp = false; 1975 if (cop.refNumber > 0) { 1976 if (cop.refNumber >= this.nofparen) 1977 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); 1978 matchp = con.match.getBeginning(cop.refNumber) >= 0 1979 && con.match.getEnd(cop.refNumber) >= 0; 1980 } else { 1981 matchp = 0 <= this. 
matchString (con, cop.condition, offset, dx, opts); 1982 } 1983 1984 if (matchp) { 1985 op = cop.yes; 1986 } else if (cop.no != null) { 1987 op = cop.no; 1988 } else { 1989 op = cop.next; 1990 } 1991 } 1992 break; 1993 1994 default: 1995 throw new RuntimeException("Unknown operation type: "+op.type); 1996 } // switch (op.type) 1997 } // while 1998 } 1999 2000 private static final int getPreviousWordType(String target, int begin, int end, 2001 int offset, int opts) { 2002 int ret = getWordType(target, begin, end, --offset, opts); 2003 while (ret == WT_IGNORE) 2004 ret = getWordType(target, begin, end, --offset, opts); 2005 return ret; 2006 } 2007 2008 private static final int getWordType(String target, int begin, int end, 2009 int offset, int opts) { 2010 if (offset < begin || offset >= end) return WT_OTHER; 2011 return getWordType0( target .charAt( offset ) , opts); 2012 } 2013 2014 2015 private static final boolean regionMatches(String text, int offset, int limit, 2016 String part, int partlen) { 2017 if (limit-offset < partlen) return false; 2018 return text.regionMatches(offset, part, 0, partlen); 2019 } 2020 2021 private static final boolean regionMatches(String text, int offset, int limit, 2022 int offset2, int partlen) { 2023 if (limit-offset < partlen) return false; 2024 return text.regionMatches(offset, text, offset2, partlen); 2025 } 2026 2027 private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit, 2028 String part, int partlen) { 2029 return text.regionMatches(true, offset, part, 0, partlen); 2030 } 2031 2032 private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit, 2033 int offset2, int partlen) { 2034 if (limit-offset < partlen) return false; 2035 return text.regionMatches(true, offset, text, offset2, partlen); 2036 } 2037 2038 2039 2040 2041 2042 2043 2044 /** 2045 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 
2046 * 2047 * @return true if the target is matched to this regular expression. 2048 */ 2049 public boolean matches(CharacterIterator target) { 2050 return this.matches(target, (Match)null); 2051 } 2052 2053 2054 /** 2055 * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not. 2056 * 2057 * @param match A Match instance for storing matching result. 2058 * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match. 2059 */ 2060 public boolean matches(CharacterIterator target, Match match) { 2061 int start = target.getBeginIndex(); 2062 int end = target.getEndIndex(); 2063 2064 2065 2066 synchronized (this) { 2067 if (this.operations == null) 2068 this.prepare(); 2069 if (this.context == null) 2070 this.context = new Context(); 2071 } 2072 Context con = null; 2073 synchronized (this.context) { 2074 con = this.context.inuse ? new Context() : this.context; 2075 con.reset(target, start, end, this.numberOfClosures); 2076 } 2077 if (match != null) { 2078 match.setNumberOfGroups(this.nofparen); 2079 match.setSource(target); 2080 } else if (this.hasBackReferences) { 2081 match = new Match(); 2082 match.setNumberOfGroups(this.nofparen); 2083 // Need not to call setSource() because 2084 // a caller can not access this match instance. 2085 } 2086 con.match = match; 2087 2088 if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) { 2089 int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options); 2090 //System.err.println("DEBUG: matchEnd="+matchEnd); 2091 if (matchEnd == con.limit) { 2092 if (con.match != null) { 2093 con.match.setBeginning(0, con.start); 2094 con.match.setEnd(0, matchEnd); 2095 } 2096 con.inuse = false; 2097 return true; 2098 } 2099 return false; 2100 } 2101 2102 /* 2103 * The pattern has only fixed string. 2104 * The engine uses Boyer-Moore. 
2105 */ 2106 if (this.fixedStringOnly) { 2107 //System.err.println("DEBUG: fixed-only: "+this.fixedString); 2108 int o = this.fixedStringTable.matches(target, con.start, con.limit); 2109 if (o >= 0) { 2110 if (con.match != null) { 2111 con.match.setBeginning(0, o); 2112 con.match.setEnd(0, o+this.fixedString.length()); 2113 } 2114 con.inuse = false; 2115 return true; 2116 } 2117 con.inuse = false; 2118 return false; 2119 } 2120 2121 /* 2122 * The pattern contains a fixed string. 2123 * The engine checks with Boyer-Moore whether the text contains the fixed string or not. 2124 * If not, it return with false. 2125 */ 2126 if (this.fixedString != null) { 2127 int o = this.fixedStringTable.matches(target, con.start, con.limit); 2128 if (o < 0) { 2129 //System.err.println("Non-match in fixed-string search."); 2130 con.inuse = false; 2131 return false; 2132 } 2133 } 2134 2135 int limit = con.limit-this.minlength; 2136 int matchStart; 2137 int matchEnd = -1; 2138 2139 /* 2140 * Checks whether the expression starts with ".*". 2141 */ 2142 if (this.operations != null 2143 && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { 2144 if (isSet(this.options, SINGLE_LINE)) { 2145 matchStart = con.start; 2146 matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options); 2147 } else { 2148 boolean previousIsEOL = true; 2149 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2150 int ch = target .setIndex( matchStart ) ; 2151 if (isEOLChar(ch)) { 2152 previousIsEOL = true; 2153 } else { 2154 if (previousIsEOL) { 2155 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2156 matchStart, 1, this.options))) 2157 break; 2158 } 2159 previousIsEOL = false; 2160 } 2161 } 2162 } 2163 } 2164 2165 /* 2166 * Optimization against the first character. 
2167 */ 2168 else if (this.firstChar != null) { 2169 //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar); 2170 RangeToken range = this.firstChar; 2171 if (RegularExpression.isSet(this.options, IGNORE_CASE)) { 2172 range = this.firstChar.getCaseInsensitiveToken(); 2173 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2174 int ch = target .setIndex( matchStart ) ; 2175 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) { 2176 ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) ); 2177 if (!range.match(ch)) continue; 2178 } else { 2179 if (!range.match(ch)) { 2180 char ch1 = Character.toUpperCase((char)ch); 2181 if (!range.match(ch1)) 2182 if (!range.match(Character.toLowerCase(ch1))) 2183 continue; 2184 } 2185 } 2186 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2187 matchStart, 1, this.options))) 2188 break; 2189 } 2190 } else { 2191 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2192 int ch = target .setIndex( matchStart ) ; 2193 if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) 2194 ch = REUtil.composeFromSurrogates(ch, target .setIndex( matchStart+1 ) ); 2195 if (!range.match(ch)) continue; 2196 if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, 2197 matchStart, 1, this.options))) 2198 break; 2199 } 2200 } 2201 } 2202 2203 /* 2204 * Straightforward matching. 2205 */ 2206 else { 2207 for (matchStart = con.start; matchStart <= limit; matchStart ++) { 2208 if (0 <= (matchEnd = this. 
matchCharacterIterator (con, this.operations, matchStart, 1, this.options))) 2209 break; 2210 } 2211 } 2212 2213 if (matchEnd >= 0) { 2214 if (con.match != null) { 2215 con.match.setBeginning(0, matchStart); 2216 con.match.setEnd(0, matchEnd); 2217 } 2218 con.inuse = false; 2219 return true; 2220 } else { 2221 con.inuse = false; 2222 return false; 2223 } 2224 } 2225 2226 /** 2227 * @return -1 when not match; offset of the end of matched string when match. 2228 */ 2229 private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) { 2230 2231 2232 CharacterIterator target = con.ciTarget; 2233 2234 2235 2236 2237 2238 2239 while (true) { 2240 if (op == null) 2241 return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset; 2242 if (offset > con.limit || offset < con.start) 2243 return -1; 2244 switch (op.type) { 2245 case Op.CHAR: 2246 if (isSet(opts, IGNORE_CASE)) { 2247 int ch = op.getData(); 2248 if (dx > 0) { 2249 if (offset >= con.limit || !matchIgnoreCase(ch, target .setIndex( offset ) )) 2250 return -1; 2251 offset ++; 2252 } else { 2253 int o1 = offset-1; 2254 if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch, target .setIndex( o1 ) )) 2255 return -1; 2256 offset = o1; 2257 } 2258 } else { 2259 int ch = op.getData(); 2260 if (dx > 0) { 2261 if (offset >= con.limit || ch != target .setIndex( offset ) ) 2262 return -1; 2263 offset ++; 2264 } else { 2265 int o1 = offset-1; 2266 if (o1 >= con.limit || o1 < 0 || ch != target .setIndex( o1 ) ) 2267 return -1; 2268 offset = o1; 2269 } 2270 } 2271 op = op.next; 2272 break; 2273 2274 case Op.DOT: 2275 if (dx > 0) { 2276 if (offset >= con.limit) 2277 return -1; 2278 int ch = target .setIndex( offset ) ; 2279 if (isSet(opts, SINGLE_LINE)) { 2280 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 2281 offset ++; 2282 } else { 2283 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 2284 ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) ); 2285 if 
(isEOLChar(ch)) 2286 return -1; 2287 } 2288 offset ++; 2289 } else { 2290 int o1 = offset-1; 2291 if (o1 >= con.limit || o1 < 0) 2292 return -1; 2293 int ch = target .setIndex( o1 ) ; 2294 if (isSet(opts, SINGLE_LINE)) { 2295 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 2296 o1 --; 2297 } else { 2298 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 2299 ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch); 2300 if (!isEOLChar(ch)) 2301 return -1; 2302 } 2303 offset = o1; 2304 } 2305 op = op.next; 2306 break; 2307 2308 case Op.RANGE: 2309 case Op.NRANGE: 2310 if (dx > 0) { 2311 if (offset >= con.limit) 2312 return -1; 2313 int ch = target .setIndex( offset ) ; 2314 if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit) 2315 ch = REUtil.composeFromSurrogates(ch, target .setIndex( ++offset ) ); 2316 RangeToken tok = op.getToken(); 2317 if (isSet(opts, IGNORE_CASE)) { 2318 tok = tok.getCaseInsensitiveToken(); 2319 if (!tok.match(ch)) { 2320 if (ch >= 0x10000) return -1; 2321 char uch; 2322 if (!tok.match(uch = Character.toUpperCase((char)ch)) 2323 && !tok.match(Character.toLowerCase(uch))) 2324 return -1; 2325 } 2326 } else { 2327 if (!tok.match(ch)) return -1; 2328 } 2329 offset ++; 2330 } else { 2331 int o1 = offset-1; 2332 if (o1 >= con.limit || o1 < 0) 2333 return -1; 2334 int ch = target .setIndex( o1 ) ; 2335 if (REUtil.isLowSurrogate(ch) && o1-1 >= 0) 2336 ch = REUtil.composeFromSurrogates( target .setIndex( --o1 ) , ch); 2337 RangeToken tok = op.getToken(); 2338 if (isSet(opts, IGNORE_CASE)) { 2339 tok = tok.getCaseInsensitiveToken(); 2340 if (!tok.match(ch)) { 2341 if (ch >= 0x10000) return -1; 2342 char uch; 2343 if (!tok.match(uch = Character.toUpperCase((char)ch)) 2344 && !tok.match(Character.toLowerCase(uch))) 2345 return -1; 2346 } 2347 } else { 2348 if (!tok.match(ch)) return -1; 2349 } 2350 offset = o1; 2351 } 2352 op = op.next; 2353 break; 2354 2355 case Op.ANCHOR: 2356 boolean go = false; 2357 switch (op.getData()) { 2358 case '^': 2359 if 
(isSet(opts, MULTIPLE_LINES)) { 2360 if (!(offset == con.start 2361 || offset > con.start && isEOLChar( target .setIndex( offset-1 ) ))) 2362 return -1; 2363 } else { 2364 if (offset != con.start) 2365 return -1; 2366 } 2367 break; 2368 2369 case '@': // Internal use only. 2370 // The @ always matches line beginnings. 2371 if (!(offset == con.start 2372 || offset > con.start && isEOLChar( target .setIndex( offset-1 ) ))) 2373 return -1; 2374 break; 2375 2376 case '$': 2377 if (isSet(opts, MULTIPLE_LINES)) { 2378 if (!(offset == con.limit 2379 || offset < con.limit && isEOLChar( target .setIndex( offset ) ))) 2380 return -1; 2381 } else { 2382 if (!(offset == con.limit 2383 || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) ) 2384 || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN 2385 && target .setIndex( offset+1 ) == LINE_FEED)) 2386 return -1; 2387 } 2388 break; 2389 2390 case 'A': 2391 if (offset != con.start) return -1; 2392 break; 2393 2394 case 'Z': 2395 if (!(offset == con.limit 2396 || offset+1 == con.limit && isEOLChar( target .setIndex( offset ) ) 2397 || offset+2 == con.limit && target .setIndex( offset ) == CARRIAGE_RETURN 2398 && target .setIndex( offset+1 ) == LINE_FEED)) 2399 return -1; 2400 break; 2401 2402 case 'z': 2403 if (offset != con.limit) return -1; 2404 break; 2405 2406 case 'b': 2407 if (con.length == 0) return -1; 2408 { 2409 int after = getWordType(target, con.start, con.limit, offset, opts); 2410 if (after == WT_IGNORE) return -1; 2411 int before = getPreviousWordType(target, con.start, con.limit, offset, opts); 2412 if (after == before) return -1; 2413 } 2414 break; 2415 2416 case 'B': 2417 if (con.length == 0) 2418 go = true; 2419 else { 2420 int after = getWordType(target, con.start, con.limit, offset, opts); 2421 go = after == WT_IGNORE 2422 || after == getPreviousWordType(target, con.start, con.limit, offset, opts); 2423 } 2424 if (!go) return -1; 2425 break; 2426 2427 case '<': 2428 if 
(con.length == 0 || offset == con.limit) return -1; 2429 if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER 2430 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER) 2431 return -1; 2432 break; 2433 2434 case '>': 2435 if (con.length == 0 || offset == con.start) return -1; 2436 if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER 2437 || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER) 2438 return -1; 2439 break; 2440 } // switch anchor type 2441 op = op.next; 2442 break; 2443 2444 case Op.BACKREFERENCE: 2445 { 2446 int refno = op.getData(); 2447 if (refno <= 0 || refno >= this.nofparen) 2448 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno); 2449 if (con.match.getBeginning(refno) < 0 2450 || con.match.getEnd(refno) < 0) 2451 return -1; // ******** 2452 int o2 = con.match.getBeginning(refno); 2453 int literallen = con.match.getEnd(refno)-o2; 2454 if (!isSet(opts, IGNORE_CASE)) { 2455 if (dx > 0) { 2456 if (!regionMatches(target, offset, con.limit, o2, literallen)) 2457 return -1; 2458 offset += literallen; 2459 } else { 2460 if (!regionMatches(target, offset-literallen, con.limit, o2, literallen)) 2461 return -1; 2462 offset -= literallen; 2463 } 2464 } else { 2465 if (dx > 0) { 2466 if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen)) 2467 return -1; 2468 offset += literallen; 2469 } else { 2470 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 2471 o2, literallen)) 2472 return -1; 2473 offset -= literallen; 2474 } 2475 } 2476 } 2477 op = op.next; 2478 break; 2479 case Op.STRING: 2480 { 2481 String literal = op.getString(); 2482 int literallen = literal.length(); 2483 if (!isSet(opts, IGNORE_CASE)) { 2484 if (dx > 0) { 2485 if (!regionMatches(target, offset, con.limit, literal, literallen)) 2486 return -1; 2487 offset += literallen; 2488 } else { 2489 if (!regionMatches(target, 
offset-literallen, con.limit, literal, literallen)) 2490 return -1; 2491 offset -= literallen; 2492 } 2493 } else { 2494 if (dx > 0) { 2495 if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen)) 2496 return -1; 2497 offset += literallen; 2498 } else { 2499 if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit, 2500 literal, literallen)) 2501 return -1; 2502 offset -= literallen; 2503 } 2504 } 2505 } 2506 op = op.next; 2507 break; 2508 2509 case Op.CLOSURE: 2510 { 2511 /* 2512 * Saves current position to avoid 2513 * zero-width repeats. 2514 */ 2515 int id = op.getData(); 2516 if (id >= 0) { 2517 int previousOffset = con.offsets[id]; 2518 if (previousOffset < 0 || previousOffset != offset) { 2519 con.offsets[id] = offset; 2520 } else { 2521 con.offsets[id] = -1; 2522 op = op.next; 2523 break; 2524 } 2525 } 2526 2527 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 2528 if (id >= 0) con.offsets[id] = -1; 2529 if (ret >= 0) return ret; 2530 op = op.next; 2531 } 2532 break; 2533 2534 case Op.QUESTION: 2535 { 2536 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 2537 if (ret >= 0) return ret; 2538 op = op.next; 2539 } 2540 break; 2541 2542 case Op.NONGREEDYCLOSURE: 2543 case Op.NONGREEDYQUESTION: 2544 { 2545 int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); 2546 if (ret >= 0) return ret; 2547 op = op.getChild(); 2548 } 2549 break; 2550 2551 case Op.UNION: 2552 for (int i = 0; i < op.size(); i ++) { 2553 int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts); 2554 if (DEBUG) { 2555 System.err.println("UNION: "+i+", ret="+ret); 2556 } 2557 if (ret >= 0) return ret; 2558 } 2559 return -1; 2560 2561 case Op.CAPTURE: 2562 int refno = op.getData(); 2563 if (con.match != null && refno > 0) { 2564 int save = con.match.getBeginning(refno); 2565 con.match.setBeginning(refno, offset); 2566 int ret = this. 
matchCharacterIterator (con, op.next, offset, dx, opts); 2567 if (ret < 0) con.match.setBeginning(refno, save); 2568 return ret; 2569 } else if (con.match != null && refno < 0) { 2570 int index = -refno; 2571 int save = con.match.getEnd(index); 2572 con.match.setEnd(index, offset); 2573 int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts); 2574 if (ret < 0) con.match.setEnd(index, save); 2575 return ret; 2576 } 2577 op = op.next; 2578 break; 2579 2580 case Op.LOOKAHEAD: 2581 if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1; 2582 op = op.next; 2583 break; 2584 case Op.NEGATIVELOOKAHEAD: 2585 if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts)) return -1; 2586 op = op.next; 2587 break; 2588 case Op.LOOKBEHIND: 2589 if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1; 2590 op = op.next; 2591 break; 2592 case Op.NEGATIVELOOKBEHIND: 2593 if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts)) return -1; 2594 op = op.next; 2595 break; 2596 2597 case Op.INDEPENDENT: 2598 { 2599 int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts); 2600 if (ret < 0) return ret; 2601 offset = ret; 2602 op = op.next; 2603 } 2604 break; 2605 2606 case Op.MODIFIER: 2607 { 2608 int localopts = opts; 2609 localopts |= op.getData(); 2610 localopts &= ~op.getData2(); 2611 //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16)); 2612 int ret = this. 
matchCharacterIterator (con, op.getChild(), offset, dx, localopts); 2613 if (ret < 0) return ret; 2614 offset = ret; 2615 op = op.next; 2616 } 2617 break; 2618 2619 case Op.CONDITION: 2620 { 2621 Op.ConditionOp cop = (Op.ConditionOp)op; 2622 boolean matchp = false; 2623 if (cop.refNumber > 0) { 2624 if (cop.refNumber >= this.nofparen) 2625 throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber); 2626 matchp = con.match.getBeginning(cop.refNumber) >= 0 2627 && con.match.getEnd(cop.refNumber) >= 0; 2628 } else { 2629 matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts); 2630 } 2631 2632 if (matchp) { 2633 op = cop.yes; 2634 } else if (cop.no != null) { 2635 op = cop.no; 2636 } else { 2637 op = cop.next; 2638 } 2639 } 2640 break; 2641 2642 default: 2643 throw new RuntimeException("Unknown operation type: "+op.type); 2644 } // switch (op.type) 2645 } // while 2646 } 2647 2648 private static final int getPreviousWordType(CharacterIterator target, int begin, int end, 2649 int offset, int opts) { 2650 int ret = getWordType(target, begin, end, --offset, opts); 2651 while (ret == WT_IGNORE) 2652 ret = getWordType(target, begin, end, --offset, opts); 2653 return ret; 2654 } 2655 2656 private static final int getWordType(CharacterIterator target, int begin, int end, 2657 int offset, int opts) { 2658 if (offset < begin || offset >= end) return WT_OTHER; 2659 return getWordType0( target .setIndex( offset ) , opts); 2660 } 2661 2662 2663 2664 private static final boolean regionMatches(CharacterIterator target, int offset, int limit, 2665 String part, int partlen) { 2666 if (offset < 0) return false; 2667 if (limit-offset < partlen) 2668 return false; 2669 int i = 0; 2670 while (partlen-- > 0) { 2671 if ( target .setIndex( offset++ ) != part.charAt(i++)) 2672 return false; 2673 } 2674 return true; 2675 } 2676 2677 private static final boolean regionMatches(CharacterIterator target, int offset, int limit, 
2678 int offset2, int partlen) { 2679 if (offset < 0) return false; 2680 if (limit-offset < partlen) 2681 return false; 2682 int i = offset2; 2683 while (partlen-- > 0) { 2684 if ( target .setIndex( offset++ ) != target .setIndex( i++ ) ) 2685 return false; 2686 } 2687 return true; 2688 } 2689 2690 /** 2691 * @see java.lang.String#regionMatches 2692 */ 2693 private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit, 2694 String part, int partlen) { 2695 if (offset < 0) return false; 2696 if (limit-offset < partlen) 2697 return false; 2698 int i = 0; 2699 while (partlen-- > 0) { 2700 char ch1 = target .setIndex( offset++ ) ; 2701 char ch2 = part.charAt(i++); 2702 if (ch1 == ch2) 2703 continue; 2704 char uch1 = Character.toUpperCase(ch1); 2705 char uch2 = Character.toUpperCase(ch2); 2706 if (uch1 == uch2) 2707 continue; 2708 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 2709 return false; 2710 } 2711 return true; 2712 } 2713 2714 private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit, 2715 int offset2, int partlen) { 2716 if (offset < 0) return false; 2717 if (limit-offset < partlen) 2718 return false; 2719 int i = offset2; 2720 while (partlen-- > 0) { 2721 char ch1 = target .setIndex( offset++ ) ; 2722 char ch2 = target .setIndex( i++ ) ; 2723 if (ch1 == ch2) 2724 continue; 2725 char uch1 = Character.toUpperCase(ch1); 2726 char uch2 = Character.toUpperCase(ch2); 2727 if (uch1 == uch2) 2728 continue; 2729 if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2)) 2730 return false; 2731 } 2732 return true; 2733 } 2734 2735 2736 2737 2738 // ================================================================ 2739 2740 /** 2741 * A regular expression. 2742 * @serial 2743 */ 2744 String regex; 2745 /** 2746 * @serial 2747 */ 2748 int options; 2749 2750 /** 2751 * The number of parenthesis in the regular expression. 
2752 * @serial 2753 */ 2754 int nofparen; 2755 /** 2756 * Internal representation of the regular expression. 2757 * @serial 2758 */ 2759 Token tokentree; 2760 2761 boolean hasBackReferences = false; 2762 2763 transient int minlength; 2764 transient Op operations = null; 2765 transient int numberOfClosures; 2766 transient Context context = null; 2767 transient RangeToken firstChar = null; 2768 2769 transient String fixedString = null; 2770 transient int fixedStringOptions; 2771 transient BMPattern fixedStringTable = null; 2772 transient boolean fixedStringOnly = false; 2773 2774 2775 static final class Context { 2776 CharacterIterator ciTarget; 2777 String strTarget; 2778 char[] charTarget; 2779 int start; 2780 int limit; 2781 int length; 2782 Match match; 2783 boolean inuse = false; 2784 int[] offsets; 2785 2786 Context() { 2787 } 2788 2789 private void resetCommon(int nofclosures) { 2790 this.length = this.limit-this.start; 2791 this.inuse = true; 2792 this.match = null; 2793 if (this.offsets == null || this.offsets.length != nofclosures) 2794 this.offsets = new int[nofclosures]; 2795 for (int i = 0; i < nofclosures; i ++) this.offsets[i] = -1; 2796 } 2797 void reset(CharacterIterator target, int start, int limit, int nofclosures) { 2798 this.ciTarget = target; 2799 this.start = start; 2800 this.limit = limit; 2801 this.resetCommon(nofclosures); 2802 } 2803 void reset(String target, int start, int limit, int nofclosures) { 2804 this.strTarget = target; 2805 this.start = start; 2806 this.limit = limit; 2807 this.resetCommon(nofclosures); 2808 } 2809 void reset(char[] target, int start, int limit, int nofclosures) { 2810 this.charTarget = target; 2811 this.start = start; 2812 this.limit = limit; 2813 this.resetCommon(nofclosures); 2814 } 2815 } 2816 2817 /** 2818 * Prepares for matching. This method is called just before starting matching. 
2819 */ 2820 void prepare() { 2821 if (Op.COUNT) Op.nofinstances = 0; 2822 this.compile(this.tokentree); 2823 /* 2824 if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .* 2825 Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@'); 2826 anchor.next = this.operations; 2827 this.operations = anchor; 2828 } 2829 */ 2830 if (Op.COUNT) System.err.println("DEBUG: The number of operations: "+Op.nofinstances); 2831 2832 this.minlength = this.tokentree.getMinLength(); 2833 2834 this.firstChar = null; 2835 if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION) 2836 && !isSet(this.options, XMLSCHEMA_MODE)) { 2837 RangeToken firstChar = Token.createRange(); 2838 int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options); 2839 if (fresult == Token.FC_TERMINAL) { 2840 firstChar.compactRanges(); 2841 this.firstChar = firstChar; 2842 if (DEBUG) 2843 System.err.println("DEBUG: Use the first character optimization: "+firstChar); 2844 } 2845 } 2846 2847 if (this.operations != null 2848 && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR) 2849 && this.operations.next == null) { 2850 if (DEBUG) 2851 System.err.print(" *** Only fixed string! 
*** "); 2852 this.fixedStringOnly = true; 2853 if (this.operations.type == Op.STRING) 2854 this.fixedString = this.operations.getString(); 2855 else if (this.operations.getData() >= 0x10000) { // Op.CHAR 2856 this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData()); 2857 } else { 2858 char[] ac = new char[1]; 2859 ac[0] = (char)this.operations.getData(); 2860 this.fixedString = new String(ac); 2861 } 2862 this.fixedStringOptions = this.options; 2863 this.fixedStringTable = new BMPattern(this.fixedString, 256, 2864 isSet(this.fixedStringOptions, IGNORE_CASE)); 2865 } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION) 2866 && !isSet(this.options, XMLSCHEMA_MODE)) { 2867 Token.FixedStringContainer container = new Token.FixedStringContainer(); 2868 this.tokentree.findFixedString(container, this.options); 2869 this.fixedString = container.token == null ? null : container.token.getString(); 2870 this.fixedStringOptions = container.options; 2871 if (this.fixedString != null && this.fixedString.length() < 2) 2872 this.fixedString = null; 2873 // This pattern has a fixed string of which length is more than one. 2874 if (this.fixedString != null) { 2875 this.fixedStringTable = new BMPattern(this.fixedString, 256, 2876 isSet(this.fixedStringOptions, IGNORE_CASE)); 2877 if (DEBUG) { 2878 System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length() 2879 +"/" //+this.fixedString 2880 +"/"+REUtil.createOptionString(this.fixedStringOptions)); 2881 System.err.print("String: "); 2882 REUtil.dumpString(this.fixedString); 2883 } 2884 } 2885 } 2886 } 2887 2888 /** 2889 * An option. 2890 * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span> 2891 * captures matched text, and <span class="REGEX"><kbd>(:?</kbd><var>X</var><kbd>)</kbd></span> 2892 * does not capture. 
2893 * 2894 * @see #RegularExpression(java.lang.String,int) 2895 * @see #setPattern(java.lang.String,int) 2896 static final int MARK_PARENS = 1<<0; 2897 */ 2898 2899 /** 2900 * "i" 2901 */ 2902 static final int IGNORE_CASE = 1<<1; 2903 2904 /** 2905 * "s" 2906 */ 2907 static final int SINGLE_LINE = 1<<2; 2908 2909 /** 2910 * "m" 2911 */ 2912 static final int MULTIPLE_LINES = 1<<3; 2913 2914 /** 2915 * "x" 2916 */ 2917 static final int EXTENDED_COMMENT = 1<<4; 2918 2919 /** 2920 * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>. 2921 * 2922 * @see #RegularExpression(java.lang.String,int) 2923 * @see #setPattern(java.lang.String,int) 2924 * @see #UNICODE_WORD_BOUNDARY 2925 */ 2926 static final int USE_UNICODE_CATEGORY = 1<<5; // "u" 2927 2928 /** 2929 * An option. 2930 * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \< \></kbd></span>. 2931 * <p>By default, the engine considers a position between a word character 2932 * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character 2933 * is a word boundary. 2934 * <p>By this option, the engine checks word boundaries with the method of 2935 * 'Unicode Regular Expression Guidelines' Revision 4. 2936 * 2937 * @see #RegularExpression(java.lang.String,int) 2938 * @see #setPattern(java.lang.String,int) 2939 */ 2940 static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w" 2941 2942 /** 2943 * "H" 2944 */ 2945 static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7; 2946 /** 2947 * "F" 2948 */ 2949 static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8; 2950 /** 2951 * "X". XML Schema mode. 2952 */ 2953 static final int XMLSCHEMA_MODE = 1<<9; 2954 /** 2955 * ",". 2956 */ 2957 static final int SPECIAL_COMMA = 1<<10; 2958 2959 2960 private static final boolean isSet(int options, int flag) { 2961 return (options & flag) == flag; 2962 } 2963 2964 /** 2965 * Creates a new RegularExpression instance. 
2966 * 2967 * @param regex A regular expression 2968 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2969 */ 2970 public RegularExpression(String regex) throws ParseException { 2971 this.setPattern(regex, null); 2972 } 2973 2974 /** 2975 * Creates a new RegularExpression instance with options. 2976 * 2977 * @param regex A regular expression 2978 * @param options A String consisted of "i" "m" "s" "u" "w" "," "X" 2979 * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax. 2980 */ 2981 public RegularExpression(String regex, String options) throws ParseException { 2982 this.setPattern(regex, options); 2983 } 2984 2985 RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) { 2986 this.regex = regex; 2987 this.tokentree = tok; 2988 this.nofparen = parens; 2989 this.options = options; 2990 this.hasBackReferences = hasBackReferences; 2991 } 2992 2993 /** 2994 * 2995 */ 2996 public void setPattern(String newPattern) throws ParseException { 2997 this.setPattern(newPattern, this.options); 2998 } 2999 3000 private void setPattern(String newPattern, int options) throws ParseException { 3001 this.regex = newPattern; 3002 this.options = options; 3003 RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE) 3004 ? new ParserForXMLSchema() : new RegexParser(); 3005 this.tokentree = rp.parse(this.regex, this.options); 3006 this.nofparen = rp.parennumber; 3007 this.hasBackReferences = rp.hasBackReferences; 3008 3009 this.operations = null; 3010 this.context = null; 3011 } 3012 /** 3013 * 3014 */ 3015 public void setPattern(String newPattern, String options) throws ParseException { 3016 this.setPattern(newPattern, REUtil.parseOptions(options)); 3017 } 3018 3019 /** 3020 * 3021 */ 3022 public String getPattern() { 3023 return this.regex; 3024 } 3025 3026 /** 3027 * Represents this instence in String. 
*/
    public String toString() {
        final int opts = this.options;
        return this.tokentree.toString(opts);
    }

    /**
     * Returns the options of this instance as a string of option letters.
     * The letters may appear in a different order from the string that was
     * passed to a constructor or to <code>setPattern()</code>.
     *
     * @see #RegularExpression(java.lang.String,java.lang.String)
     * @see #setPattern(java.lang.String,java.lang.String)
     */
    public String getOptions() {
        final String letters = REUtil.createOptionString(this.options);
        return letters;
    }

    /**
     * Compares this instance with another object.
     *
     * @return true if <code>obj</code> is a RegularExpression whose pattern
     *         and option bits are both equal to those of this instance.
     */
    public boolean equals(Object obj) {
        // instanceof is false for null, so no separate null test is needed.
        if (obj instanceof RegularExpression) {
            final RegularExpression other = (RegularExpression) obj;
            return this.regex.equals(other.regex) && this.options == other.options;
        }
        return false;
    }

    /**
     * Compares this instance with a pattern / option-bits pair.
     *
     * @return true if both the pattern text and the option bits are equal.
     */
    boolean equals(String pattern, int options) {
        if (!this.regex.equals(pattern)) {
            return false;
        }
        return this.options == options;
    }

    /**
     * Returns a hash code computed from the pattern text and the option string.
     */
    public int hashCode() {
        final String key = this.regex + "/" + this.getOptions();
        return key.hashCode();
    }

    /**
     * Return the number of regular expression groups.
     * This method returns 1 when the regular expression has no capturing-parenthesis.
     */
    public int getNumberOfGroups() {
        return this.nofparen;
    }

    // ================================================================

    /** Word type: the character is skipped when word types are determined. */
    private static final int WT_IGNORE = 0;
    /** Word type: the character is a word character. */
    private static final int WT_LETTER = 1;
    /** Word type: the character is not a word character. */
    private static final int WT_OTHER = 2;

    /**
     * Classifies a character as one of <code>WT_IGNORE</code>, <code>WT_LETTER</code>
     * or <code>WT_OTHER</code>.
     * <p>With <code>UNICODE_WORD_BOUNDARY</code> the Unicode general category decides.
     * Otherwise the result is never <code>WT_IGNORE</code>: the "IsWord" range is
     * consulted under <code>USE_UNICODE_CATEGORY</code>, and the legacy word
     * character test is used when neither option is set.
     *
     * @param ch   the character to classify
     * @param opts the option bit set
     * @return the word type of <code>ch</code>
     */
    private static final int getWordType0(char ch, int opts) {
        if (isSet(opts, UNICODE_WORD_BOUNDARY)) {
            switch (Character.getType(ch)) {
                case Character.UPPERCASE_LETTER:        // Lu
                case Character.LOWERCASE_LETTER:        // Ll
                case Character.TITLECASE_LETTER:        // Lt
                case Character.MODIFIER_LETTER:         // Lm
                case Character.OTHER_LETTER:            // Lo
                case Character.LETTER_NUMBER:           // Nl
                case Character.DECIMAL_DIGIT_NUMBER:    // Nd
                case Character.OTHER_NUMBER:            // No
                case Character.COMBINING_SPACING_MARK:  // Mc
                    return WT_LETTER;

                case Character.FORMAT:                  // Cf
                case Character.NON_SPACING_MARK:        // Mn
                case Character.ENCLOSING_MARK:          // Me
                    return WT_IGNORE;

                case Character.CONTROL:                 // Cc
                    // TAB, LF, VT, FF and CR count as non-word characters;
                    // every other control character is ignored.
                    return (ch == '\t' || ch == '\n' || ch == '\u000B'
                            || ch == '\f' || ch == '\r') ? WT_OTHER : WT_IGNORE;

                default:
                    return WT_OTHER;
            }
        }

        final boolean wordChar = isSet(opts, USE_UNICODE_CATEGORY)
                                 ? Token.getRange("IsWord", true).match(ch)
                                 : isWordChar(ch);
        return wordChar ? WT_LETTER : WT_OTHER;
    }

    // ================================================================

    static final int LINE_FEED = 0x000A;
    static final int CARRIAGE_RETURN = 0x000D;
    static final int LINE_SEPARATOR = 0x2028;
    static final int PARAGRAPH_SEPARATOR = 0x2029;

    /**
     * Tells whether <code>ch</code> is one of the four end-of-line characters:
     * LINE_FEED, CARRIAGE_RETURN, LINE_SEPARATOR or PARAGRAPH_SEPARATOR.
     */
    private static final boolean isEOLChar(int ch) {
        switch (ch) {
            case LINE_FEED:
            case CARRIAGE_RETURN:
            case LINE_SEPARATOR:
            case PARAGRAPH_SEPARATOR:
                return true;
            default:
                return false;
        }
    }

    /**
     * Legacy word character test: underscore, '0'-'9', 'A'-'Z' and 'a'-'z'.
     */
    private static final boolean isWordChar(int ch) {
        return ch == '_'
            || (ch >= '0' && ch <= '9')
            || (ch >= 'A' && ch <= 'Z')
            || (ch >= 'a' && ch <= 'z');
    }

    /**
     * Compares two characters without regard to case.
     * Identical values always match. Otherwise, if either value is above 0xffff
     * there is no match; the remaining values are compared after upper-casing
     * and, failing that, after lower-casing the upper-cased forms.
     */
    private static final boolean matchIgnoreCase(int chardata, int ch) {
        if (chardata == ch) {
            return true;
        }
        final boolean bothFitInChar = chardata <= 0xffff && ch <= 0xffff;
        if (!bothFitInChar) {
            return false;
        }
        final char upperA = Character.toUpperCase((char) chardata);
        final char upperB = Character.toUpperCase((char) ch);
        return upperA == upperB
            || Character.toLowerCase(upperA) == Character.toLowerCase(upperB);
    }
}