Home » Xerces-J-src.2.9.1 » org.apache.xerces » impl » xpath » regex » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    * 
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    * 
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.xerces.impl.xpath.regex;
   19   
   20   import java.text.CharacterIterator;
   21   
   22   /**
   23    * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
   24    * This engine does not conform to the POSIX regular expression.
   25    *
   26    * <hr width="50%">
   27    * <h3>How to use</h3>
   28    *
   29    * <dl>
   30    *   <dt>A. Standard way
   31    *   <dd>
   32    * <pre>
   33    * RegularExpression re = new RegularExpression(<var>regex</var>);
   34    * if (re.matches(text)) { ... }
   35    * </pre>
   36    *
   37    *   <dt>B. Capturing groups
   38    *   <dd>
   39    * <pre>
   40    * RegularExpression re = new RegularExpression(<var>regex</var>);
   41    * Match match = new Match();
   42    * if (re.matches(text, match)) {
   43    *     ... // You can refer captured texts with methods of the <code>Match</code> class.
   44    * }
   45    * </pre>
   46    *
   47    * </dl>
   48    *
   49    * <h4>Case-insensitive matching</h4>
   50    * <pre>
   51    * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
   52    * if (re.matches(text)) { ... }
   53    * </pre>
   54    *
   55    * <h4>Options</h4>
   56    * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
   57    *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
   58    *    This <var>options</var> parameter consists of the following characters.
   59    * </p>
   60    * <dl>
   61    *   <dt><a name="I_OPTION"><code>"i"</code></a>
   62    *   <dd>This option indicates case-insensitive matching.
   63    *   <dt><a name="M_OPTION"><code>"m"</code></a>
   64    *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
   65    *   <dt><a name="S_OPTION"><code>"s"</code></a>
   66    *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
   67    *   <dt><a name="U_OPTION"><code>"u"</code></a>
   68    *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> as becoming to Unicode.
   69    *   <dt><a name="W_OPTION"><code>"w"</code></a>
   70    *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
   71    *      'Unicode Regular Expression Guidelines' Revision 4.
   72    *      When "w" and "u" are specified at the same time,
   73    *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
   74    *   <dt><a name="COMMA_OPTION"><code>","</code></a>
   75    *   <dd>The parser treats a comma in a character class as a range separator.
   76    *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
   77    *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
   78    *
   79    *   <dt><a name="X_OPTION"><code>"X"</code></a>
   80    *   <dd class="REGEX">
   81    *       By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
   82    *       The <code>match()</code> method does not do substring matching
   83    *       but entire string matching.
   84    *
   85    * </dl>
   86    * 
   87    * <hr width="50%">
   88    * <h3>Syntax</h3>
   89    * <table border="1" bgcolor="#ddeeff">
   90    *   <tr>
   91    *    <td>
   92    *     <h4>Differences from the Perl 5 regular expression</h4>
   93    *     <ul>
   94    *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
   95    *      <li>Supports subtraction, union, and intersection operations for character classes.
   96    *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
   97    *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
   98    *          <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
   99    *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
  100    *          <kbd>(?{</kbd><var>code</var><kbd>})</kbd>, <kbd>(??{</kbd><var>code</var><kbd>})</kbd>
  101    *     </ul>
  102    *    </td>
  103    *   </tr>
  104    * </table>
  105    *
  106    * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
  107    * <ul>
  108    *   <li>Character
  109    *     <dl>
  110    *       <dt class="REGEX"><kbd>.</kbd> (A period)
  111    *       <dd>Matches any one character except the following characters.
  112    *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
  113    *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
  114    *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
  115    *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
  116    *           it matches any character including the above four characters.
  117    *
  118    *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
  119    *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
  120    *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
  121    *
  122    *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
  123    *       <dd>Matches a control character.
  124    *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
  125    *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
  126    *           It matches a control character of which the character code is less than
  127    *           the character code of the <var>C</var> by 0x0040.
  128    *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
  129    *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
  130    *
  131    *       <dt class="REGEX">a non-meta character
  132    *       <dd>Matches the character.
  133    *
  134    *       <dt class="REGEX"><KBD>\</KBD> + a meta character
  135    *       <dd>Matches the meta character.
  136    *
  137    *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
  138    *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
  139    *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
  140    *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
  141    *
  142    *       <!--
  143    *       <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
  144    *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
  145    *       -->
  146    *
  147    *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
  148    *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
  149    *
  150    *       <dt class="REGEX"><kbd>\g</kbd>
  151    *       <dd>Matches a grapheme.
  152    *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
  153    *
  154    *       <dt class="REGEX"><kbd>\X</kbd>
  155    *       <dd class="REGEX">Matches a combining character sequence.
  156    *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
  157    *     </dl>
  158    *   </li>
  159    *
  160    *   <li>Character class
  161    *     <dl>
  162    *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
  163    *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
  164    *       <dd>Positive character class.  It matches a character in ranges.
  165    *       <dd><var>R<sub>n</sub></var>:
  166    *       <ul>
  167    *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
  168    *             <p>This range matches the character.
  169    *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
  170    *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
  171    *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
  172    *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
  173    *             <p>...
  174    *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
  175    *             <p>These expressions specify the same ranges as the following expressions.
  176    *       </ul>
  177    *       <p class="REGEX">Enumerated ranges are merged (union operation).
  178    *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
  179    *
  180    *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
  181    *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
  182    *       <dd>Negative character class.  It matches a character not in ranges.
  183    *
  184    *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
  185    *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
  186    *       <dd>Subtraction or union or intersection for character classes.
  187    *       <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
  188    *       <dd>The result of these operations is a <u>positive character class</u>
  189    *           even if an expression includes any negative character classes.
  190    *           You have to take care on this in case-insensitive matching.
  191    *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
  192    *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
  193    *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
  194    *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
  195    *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
  196    *
  197    *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
  198    *       <dd>Character class subtraction for the XML Schema.
  199    *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
  200    *           
  201    *       <dt class="REGEX"><kbd>\d</kbd>
  202    *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
  203    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  204    *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
  205    *
  206    *       <dt class="REGEX"><kbd>\D</kbd>
  207    *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
  208    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  209    *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
  210    *
  211    *       <dt class="REGEX"><kbd>\s</kbd>
  212    *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
  213    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  214    *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
  215    *
  216    *       <dt class="REGEX"><kbd>\S</kbd>
  217    *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
  218    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  219    *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
  220    *
  221    *       <dt class="REGEX"><kbd>\w</kbd>
  222    *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
  223    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  224    *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
  225    *
  226    *       <dt class="REGEX"><kbd>\W</kbd>
  227    *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
  228    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  229    *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
  230    *
  231    *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
  232    *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
  233    *       The following names are available:
  234    *       <dl>
  235    *         <dt>Unicode General Categories:
  236    *         <dd><kbd>
  237    *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
  238    *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
  239    *         </kbd>
  240    *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
  241    *         <dt>Unicode Blocks:
  242    *         <dd><kbd>
  243    *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
  244    *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
  245    *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
  246    *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
  247    *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
  248    *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
  249    *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
  250    *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
  251    *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
  252    *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
  253    *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
  254    *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
  255    *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
  256    *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
  257    *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
  258    *       Small Form Variants, Arabic Presentation Forms-B, Specials,
  259    *       Halfwidth and Fullwidth Forms
  260    *         </kbd>
  261    *         <dt>Others:
  262    *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
  263    *         <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
  264    *         <dd><kbd>UNASSIGNED</kbd>
  265    *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
  266    *       </dl>
  267    *
  268    *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
  269    *       <dd>Matches one character not in the specified General Category or the specified Block.
  270    *     </dl>
  271    *   </li>
  272    *
  273    *   <li>Selection and Quantifier
  274    *     <dl>
  275    *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
  276    *       <dd>...
  277    *
  278    *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
  279    *       <dd>Matches 0 or more <var>X</var>.
  280    *
  281    *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
  282    *       <dd>Matches 1 or more <var>X</var>.
  283    *
  284    *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
  285    *       <dd>Matches 0 or 1 <var>X</var>.
  286    *
  287    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
  288    *       <dd>Matches <var>number</var> times.
  289    *
  290    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
  291    *       <dd>...
  292    *
  293    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
  294    *       <dd>...
  295    *
  296    *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
  297    *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
  298    *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
  299    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
  300    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
  301    *       <dd>Non-greedy matching.
  302    *     </dl>
  303    *   </li>
  304    *
  305    *   <li>Grouping, Capturing, and Back-reference
  306    *     <dl>
  307    *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
  308    *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
  309    *       If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
  310    *       you have to write "<KBD>(?:foo)+</KBD>".
  311    *
  312    *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
  313    *       <dd>Grouping with capturing.
  314    * It makes a group and applications can know
  315    * where in target text a group matched with methods of a <code>Match</code> instance
  316    * after <code><a href="#matches(java.lang.String, org.apache.xerces.impl.xpath.regex.Match)">matches(String,Match)</a></code>.
  317    * The 0th group means whole of this regular expression.
  318    * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
  319    * 
  320    *   <p>For instance, a regular expression is
  321    *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
  322    *   and target text is
  323    *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
  324    *   <ul>
  325    *     <li><code>Match.getCapturedText(0)</code>:
  326    *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
  327    *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
  328    *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
  329    *   </ul>
  330    *
  331    *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
  332    *       <dd>
  333    *
  334    *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
  335    *       <dd>Independent expression group. ................
  336    *
  337    *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
  338    *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
  339    *       <dd>............................
  340    *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
  341    *           Note that it can not contain 'u'.
  342    *
  343    *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
  344    *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
  345    *       <dd>......
  346    *       <dd>These expressions must be at the beginning of a group.
  347    *     </dl>
  348    *   </li>
  349    *
  350    *   <li>Anchor
  351    *     <dl>
  352    *       <dt class="REGEX"><kbd>\A</kbd>
  353    *       <dd>Matches the beginning of the text.
  354    *
  355    *       <dt class="REGEX"><kbd>\Z</kbd>
  356    *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
  357    *           or CARRIAGE RETURN + LINE FEED at the end of the text.
  358    *
  359    *       <dt class="REGEX"><kbd>\z</kbd>
  360    *       <dd>Matches the end of the text.
  361    *
  362    *       <dt class="REGEX"><kbd>^</kbd>
  363    *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
  364    *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
  365    *           it matches the beginning of the text, or after one of EOL characters (
  366    *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
  367    *           PARAGRAPH SEPARATOR (U+2029).)
  368    *
  369    *       <dt class="REGEX"><kbd>$</kbd>
  370    *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
  371    *           or CARRIAGE RETURN + LINE FEED at the end of the text.
  372    *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
  373    *           it matches the end of the text, or before an EOL character.
  374    *
  375    *       <dt class="REGEX"><kbd>\b</kbd>
  376    *       <dd>Matches word boundary.
  377    *           (See <a href="#W_OPTION">a "w" option</a>)
  378    *
  379    *       <dt class="REGEX"><kbd>\B</kbd>
  380    *       <dd>Matches non word boundary.
  381    *           (See <a href="#W_OPTION">a "w" option</a>)
  382    *
  383    *       <dt class="REGEX"><kbd>\&lt;</kbd>
  384    *       <dd>Matches the beginning of a word.
  385    *           (See <a href="#W_OPTION">a "w" option</a>)
  386    *
  387    *       <dt class="REGEX"><kbd>\&gt;</kbd>
  388    *       <dd>Matches the end of a word.
  389    *           (See <a href="#W_OPTION">a "w" option</a>)
  390    *     </dl>
  391    *   </li>
  392    *   <li>Lookahead and lookbehind
  393    *     <dl>
  394    *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
  395    *       <dd>Lookahead.
  396    *
  397    *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
  398    *       <dd>Negative lookahead.
  399    *
  400    *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
  401    *       <dd>Lookbehind.
  402    *       <dd>(Note for text capturing......)
  403    *
  404    *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
  405    *       <dd>Negative lookbehind.
  406    *     </dl>
  407    *   </li>
  408    *
  409    *   <li>Misc.
  410    *     <dl>
  411    *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
  412    *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
  413    *       <dd>......
  414    *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
  415    *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
  416    *           You can not write comments in character classes and before quantifiers.
  417    *     </dl>
  418    *   </li>
  419    * </ul>
  420    *
  421    *
  422    * <hr width="50%">
  423    * <h3>BNF for the regular expression</h3>
  424    * <pre>
  425    * regex ::= ('(?' options ')')? term ('|' term)*
  426    * term ::= factor+
  427    * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
  428    *            | '(?#' [^)]* ')'
  429    * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
  430    * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  431    *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
  432    *          | '(?>' regex ')' | '(?' options ':' regex ')'
  433    *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
  434    * options ::= [imsw]* ('-' [imsw]+)?
  435    * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
  436    * looks ::= '(?=' regex ')'  | '(?!' regex ')'
  437    *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
  438    * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
  439    * category-block ::= '\' [pP] category-symbol-1
  440    *                    | ('\p{' | '\P{') (category-symbol | block-name
  441    *                                       | other-properties) '}'
  442    * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
  443    * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
  444    *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
  445    *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
  446    *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
  447    *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
  448    * block-name ::= (See above)
  449    * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
  450    * character-1 ::= (any character except meta-characters)
  451    *
  452    * char-class ::= '[' ranges ']'
  453    *                | '(?[' ranges ']' ([-+&] '[' ranges ']')* ')'
  454    * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
  455    * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
  456    *           | range-char | range-char '-' range-char
  457    * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
  458    * code-point ::= '\x' hex-char hex-char
  459    *                | '\x{' hex-char+ '}'
  460    * <!--               | '\u005c u' hex-char hex-char hex-char hex-char
  461    * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
  462    * hex-char ::= [0-9a-fA-F]
  463    * character-2 ::= (any character except \[]-,)
  464    * </pre>
  465    *
  466    * <hr width="50%">
  467    * <h3>TODO</h3>
  468    * <ul>
  469    *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
  470    *     <ul>
  471    *       <li>2.4 Canonical Equivalents
  472    *       <li>Level 3
  473    *     </ul>
  474    *   <li>Parsing performance
  475    * </ul>
  476    *
  477    * <hr width="50%">
  478    * 
  479    * @xerces.internal
  480    *
  481    * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
  482    * @version $Id: RegularExpression.java 446721 2006-09-15 20:35:34Z mrglavas $
  483    */
  484   public class RegularExpression implements java.io.Serializable {
  485       
  486       private static final long serialVersionUID = 6242499334195006401L;
  487   
  488       static final boolean DEBUG = false;
  489   
  490       /**
  491        * Builds the operation flow for a token tree once; a no-op when already compiled.
  492        */
  493       private synchronized void compile(Token tok) {
  494           if (this.operations == null) {
  495               this.numberOfClosures = 0;
  496               this.operations = this.compile(tok, null, false);
  497           }
  498       }
  499   
  500       /**
  501        * Converts a token to an operation.
  502        */
  503       private Op compile(Token tok, Op next, boolean reverse) {
  504           Op ret;
  505           switch (tok.type) {
  506           case Token.DOT:
  507               ret = Op.createDot();
  508               ret.next = next;
  509               break;
  510   
  511           case Token.CHAR:
  512               ret = Op.createChar(tok.getChar());
  513               ret.next = next;
  514               break;
  515   
  516           case Token.ANCHOR:
  517               ret = Op.createAnchor(tok.getChar());
  518               ret.next = next;
  519               break;
  520   
  521           case Token.RANGE:
  522           case Token.NRANGE:
  523               ret = Op.createRange(tok);
  524               ret.next = next;
  525               break;
  526   
  527           case Token.CONCAT:
  528               ret = next;
  529               if (!reverse) {
  530                   for (int i = tok.size()-1;  i >= 0;  i --) {
  531                       ret = compile(tok.getChild(i), ret, false);
  532                   }
  533               } else {
  534                   for (int i = 0;  i < tok.size();  i ++) {
  535                       ret = compile(tok.getChild(i), ret, true);
  536                   }
  537               }
  538               break;
  539   
  540           case Token.UNION:
  541               Op.UnionOp uni = Op.createUnion(tok.size());
  542               for (int i = 0;  i < tok.size();  i ++) {
  543                   uni.addElement(compile(tok.getChild(i), next, reverse));
  544               }
  545               ret = uni;                          // ret.next is null.
  546               break;
  547   
  548           case Token.CLOSURE:
  549           case Token.NONGREEDYCLOSURE:
  550               Token child = tok.getChild(0);
  551               int min = tok.getMin();
  552               int max = tok.getMax();
  553               if (min >= 0 && min == max) { // {n}
  554                   ret = next;
  555                   for (int i = 0; i < min;  i ++) {
  556                       ret = compile(child, ret, reverse);
  557                   }
  558                   break;
  559               }
  560               if (min > 0 && max > 0)
  561                   max -= min;
  562               if (max > 0) {
  563                   // X{2,6} -> XX(X(X(XX?)?)?)?
  564                   ret = next;
  565                   for (int i = 0;  i < max;  i ++) {
  566                       Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
  567                       q.next = next;
  568                       q.setChild(compile(child, ret, reverse));
  569                       ret = q;
  570                   }
  571               } else {
  572                   Op.ChildOp op;
  573                   if (tok.type == Token.NONGREEDYCLOSURE) {
  574                       op = Op.createNonGreedyClosure();
  575                   } else {                        // Token.CLOSURE
  576                       if (child.getMinLength() == 0)
  577                           op = Op.createClosure(this.numberOfClosures++);
  578                       else
  579                           op = Op.createClosure(-1);
  580                   }
  581                   op.next = next;
  582                   op.setChild(compile(child, op, reverse));
  583                   ret = op;
  584               }
  585               if (min > 0) {
  586                   for (int i = 0;  i < min;  i ++) {
  587                       ret = compile(child, ret, reverse);
  588                   }
  589               }
  590               break;
  591   
  592           case Token.EMPTY:
  593               ret = next;
  594               break;
  595   
  596           case Token.STRING:
  597               ret = Op.createString(tok.getString());
  598               ret.next = next;
  599               break;
  600   
  601           case Token.BACKREFERENCE:
  602               ret = Op.createBackReference(tok.getReferenceNumber());
  603               ret.next = next;
  604               break;
  605   
  606           case Token.PAREN:
  607               if (tok.getParenNumber() == 0) {
  608                   ret = compile(tok.getChild(0), next, reverse);
  609               } else if (reverse) {
  610                   next = Op.createCapture(tok.getParenNumber(), next);
  611                   next = compile(tok.getChild(0), next, reverse);
  612                   ret = Op.createCapture(-tok.getParenNumber(), next);
  613               } else {
  614                   next = Op.createCapture(-tok.getParenNumber(), next);
  615                   next = compile(tok.getChild(0), next, reverse);
  616                   ret = Op.createCapture(tok.getParenNumber(), next);
  617               }
  618               break;
  619   
  620           case Token.LOOKAHEAD:
  621               ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
  622               break;
  623           case Token.NEGATIVELOOKAHEAD:
  624               ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
  625               break;
  626           case Token.LOOKBEHIND:
  627               ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
  628               break;
  629           case Token.NEGATIVELOOKBEHIND:
  630               ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
  631               break;
  632   
  633           case Token.INDEPENDENT:
  634               ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
  635               break;
  636   
  637           case Token.MODIFIERGROUP:
  638               ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
  639                                       ((Token.ModifierToken)tok).getOptions(),
  640                                       ((Token.ModifierToken)tok).getOptionsMask());
  641               break;
  642   
  643           case Token.CONDITION:
  644               Token.ConditionToken ctok = (Token.ConditionToken)tok;
  645               int ref = ctok.refNumber;
  646               Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
  647               Op yes = compile(ctok.yes, next, reverse);
  648               Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
  649               ret = Op.createCondition(next, ref, condition, yes, no);
  650               break;
  651   
  652           default:
  653               throw new RuntimeException("Unknown token type: "+tok.type);
  654           } // switch (tok.type)
  655           return ret;
  656       }
  657   
  658   
  659   //Public
  660   
  661       /**
  662        * Checks the entire <var>target</var> array against this pattern, discarding group information.
  663        * @return true if the target is matched to this regular expression.
  664        */
  665       public boolean matches(char[]  target) {
  666           final Match noResult = null;
  667           return this.matches(target, 0, target.length, noResult);
  668       }
  669   
  670       /**
  671        * Checks the characters of <var>target</var> from <var>start</var> (inclusive) to
  672        * <var>end</var> (exclusive) against this pattern, discarding group information.
  673        * @param start Start offset of the range.
  674        * @param end  End offset +1 of the range.
  675        * @return true if the target is matched to this regular expression.
  676        */
  677       public boolean matches(char[]  target, int start, int end) {
  678           final Match noResult = null;
  679           return this.matches(target, start, end, noResult);
  680       }
  681   
  682       /**
  683        * Checks the entire <var>target</var> array against this pattern and records the result.
  684        * @param match A Match instance for storing matching result.
  685        * @return true if the target is matched to this regular expression.
  686        */
  687       public boolean matches(char[]  target, Match match) {
  688           final int length = target.length;
  689           return this.matches(target, 0, length, match);
  690       }
  691   
  692   
  693       /**
  694        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  695        * in the specified range or not. When the XMLSCHEMA_MODE option is set, the
  696        * pattern must match the whole range instead.
  697        * If <var>match</var> is not null, the offsets of the match are stored as group 0.
  698        *
  699        * @param target Characters to search.
  700        * @param start Start offset of the range.
  701        * @param end  End offset +1 of the range.
  702        * @param match A Match instance for storing matching result; may be null.
  703        * @return true if the target is matched to this regular expression.
  704        */
  705       public boolean matches(char[]  target, int start, int end, Match match) {
  706   
  707           synchronized (this) {
  708               if (this.operations == null)
  709                   this.prepare();
  710               if (this.context == null)
  711                   this.context = new Context();
  712           }
  713           Context con = null;
  714           synchronized (this.context) {
  715               // Fall back to a private context while the shared one is flagged as in use.
  716               con = this.context.inuse ? new Context() : this.context;
  717               con.reset(target, start, end, this.numberOfClosures);
  718           }
  719           try {
  720               if (match != null) {
  721                   match.setNumberOfGroups(this.nofparen);
  722                   match.setSource(target);
  723               } else if (this.hasBackReferences) {
  724                   match = new Match();
  725                   match.setNumberOfGroups(this.nofparen);
  726                   // Need not to call setSource() because
  727                   // a caller can not access this match instance.
  728               }
  729               con.match = match;
  730   
  731               if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
  732                   // XML Schema mode: one attempt at con.start that must end exactly at con.limit.
  733                   int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
  734                   if (matchEnd != con.limit)
  735                       return false;
  736                   if (con.match != null) {
  737                       con.match.setBeginning(0, con.start);
  738                       con.match.setEnd(0, matchEnd);
  739                   }
  740                   return true;
  741               }
  742   
  743               /*
  744                * The pattern has only fixed string.
  745                * The engine uses Boyer-Moore.
  746                */
  747               if (this.fixedStringOnly) {
  748                   int o = this.fixedStringTable.matches(target, con.start, con.limit);
  749                   if (o < 0)
  750                       return false;
  751                   if (con.match != null) {
  752                       con.match.setBeginning(0, o);
  753                       con.match.setEnd(0, o+this.fixedString.length());
  754                   }
  755                   return true;
  756               }
  757   
  758               /*
  759                * The pattern contains a fixed string.
  760                * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
  761                * If not, it returns false.
  762                */
  763               if (this.fixedString != null
  764                   && this.fixedStringTable.matches(target, con.start, con.limit) < 0) {
  765                   return false;
  766               }
  767   
  768               // No attempt is started beyond this offset (con.limit minus minlength).
  769               int limit = con.limit-this.minlength;
  770               int matchStart;
  771               int matchEnd = -1;
  772   
  773               /*
  774                * Checks whether the expression starts with ".*".
  775                */
  776               if (this.operations != null
  777                   && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
  778                   if (isSet(this.options, SINGLE_LINE)) {
  779                       matchStart = con.start;
  780                       matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
  781                   } else {
  782                       // A match is attempted only at the first non-EOL offset after con.start or an EOL run.
  783                       boolean previousIsEOL = true;
  784                       for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
  785                           // matchStart can reach con.limit when minlength is 0; there is no
  786                           // character to inspect there, so do not read outside the range.
  787                           if (matchStart < con.limit && isEOLChar( target [  matchStart ] )) {
  788                               previousIsEOL = true;
  789                           } else {
  790                               if (previousIsEOL) {
  791                                   if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  792                                                                              matchStart, 1, this.options)))
  793                                       break;
  794                               }
  795                               previousIsEOL = false;
  796                           }
  797                       }
  798                   }
  799               }
  800   
  801               /*
  802                * Optimization against the first character: offsets whose character is
  803                * not accepted by firstChar are skipped without running the matcher.
  804                */
  805               else if (this.firstChar != null) {
  806                   RangeToken range = this.firstChar;
  807                   if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
  808                       range = this.firstChar.getCaseInsensitiveToken();
  809                       for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
  810                           int ch =  target [  matchStart ] ;
  811                           if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
  812                               ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
  813                               if (!range.match(ch))  continue;
  814                           } else {
  815                               if (!range.match(ch)) {
  816                                   char ch1 = Character.toUpperCase((char)ch);
  817                                   if (!range.match(ch1))
  818                                       if (!range.match(Character.toLowerCase(ch1)))
  819                                           continue;
  820                               }
  821                           }
  822                           if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  823                                                                      matchStart, 1, this.options)))
  824                               break;
  825                       }
  826                   } else {
  827                       for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
  828                           int ch =  target [  matchStart ] ;
  829                           if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
  830                               ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
  831                           if (!range.match(ch))  continue;
  832                           if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  833                                                                      matchStart, 1, this.options)))
  834                               break;
  835                       }
  836                   }
  837               }
  838   
  839               /*
  840                * Straightforward matching.
  841                */
  842               else {
  843                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
  844                       if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
  845                           break;
  846                   }
  847               }
  848   
  849               if (matchEnd < 0)
  850                   return false;
  851               if (con.match != null) {
  852                   con.match.setBeginning(0, matchStart);
  853                   con.match.setEnd(0, matchEnd);
  854               }
  855               return true;
  856           } finally {
  857               // Clear the in-use flag on every exit path, including non-match returns
  858               // and exceptions thrown while matching, so that a later call does not
  859               // see the shared context as busy and allocate a new one needlessly.
  860               con.inuse = false;
  861           }
  862       }
  863   
  864   /**
  865    * @return -1 when not match; offset of the end of matched string when match.
  866    */
  867       private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
  868   
  869           char[] target = con.charTarget;
  870   
  871   
  872           while (true) {
  873               if (op == null)
  874                   return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
  875               if (offset > con.limit || offset < con.start)
  876                   return -1;
  877               switch (op.type) {
  878               case Op.CHAR:
  879                   if (isSet(opts, IGNORE_CASE)) {
  880                       int ch = op.getData();
  881                       if (dx > 0) {
  882                           if (offset >= con.limit || !matchIgnoreCase(ch,  target [  offset ] ))
  883                               return -1;
  884                           offset ++;
  885                       } else {
  886                           int o1 = offset-1;
  887                           if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target [  o1 ] ))
  888                               return -1;
  889                           offset = o1;
  890                       }
  891                   } else {
  892                       int ch = op.getData();
  893                       if (dx > 0) {
  894                           if (offset >= con.limit || ch !=  target [  offset ] )
  895                               return -1;
  896                           offset ++;
  897                       } else {
  898                           int o1 = offset-1;
  899                           if (o1 >= con.limit || o1 < 0 || ch !=  target [  o1 ] )
  900                               return -1;
  901                           offset = o1;
  902                       }
  903                   }
  904                   op = op.next;
  905                   break;
  906   
  907               case Op.DOT:
  908                   if (dx > 0) {
  909                       if (offset >= con.limit)
  910                           return -1;
  911                       int ch =  target [  offset ] ;
  912                       if (isSet(opts, SINGLE_LINE)) {
  913                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  914                               offset ++;
  915                       } else {
  916                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  917                               ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
  918                           if (isEOLChar(ch))
  919                               return -1;
  920                       }
  921                       offset ++;
  922                   } else {
  923                       int o1 = offset-1;
  924                       if (o1 >= con.limit || o1 < 0)
  925                           return -1;
  926                       int ch =  target [  o1 ] ;
  927                       if (isSet(opts, SINGLE_LINE)) {
  928                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  929                               o1 --;
  930                       } else {
  931                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  932                               ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
  933                           if (!isEOLChar(ch))
  934                               return -1;
  935                       }
  936                       offset = o1;
  937                   }
  938                   op = op.next;
  939                   break;
  940   
  941               case Op.RANGE:
  942               case Op.NRANGE:
  943                   if (dx > 0) {
  944                       if (offset >= con.limit)
  945                           return -1;
  946                       int ch =  target [  offset ] ;
  947                       if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  948                           ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
  949                       RangeToken tok = op.getToken();
  950                       if (isSet(opts, IGNORE_CASE)) {
  951                           tok = tok.getCaseInsensitiveToken();
  952                           if (!tok.match(ch)) {
  953                               if (ch >= 0x10000)  return -1;
  954                               char uch;
  955                               if (!tok.match(uch = Character.toUpperCase((char)ch))
  956                                   && !tok.match(Character.toLowerCase(uch)))
  957                                   return -1;
  958                           }
  959                       } else {
  960                           if (!tok.match(ch))  return -1;
  961                       }
  962                       offset ++;
  963                   } else {
  964                       int o1 = offset-1;
  965                       if (o1 >= con.limit || o1 < 0)
  966                           return -1;
  967                       int ch =  target [  o1 ] ;
  968                       if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  969                           ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
  970                       RangeToken tok = op.getToken();
  971                       if (isSet(opts, IGNORE_CASE)) {
  972                           tok = tok.getCaseInsensitiveToken();
  973                           if (!tok.match(ch)) {
  974                               if (ch >= 0x10000)  return -1;
  975                               char uch;
  976                               if (!tok.match(uch = Character.toUpperCase((char)ch))
  977                                   && !tok.match(Character.toLowerCase(uch)))
  978                                   return -1;
  979                           }
  980                       } else {
  981                           if (!tok.match(ch))  return -1;
  982                       }
  983                       offset = o1;
  984                   }
  985                   op = op.next;
  986                   break;
  987   
  988               case Op.ANCHOR:
  989                   boolean go = false;
  990                   switch (op.getData()) {
  991                   case '^':
  992                       if (isSet(opts, MULTIPLE_LINES)) {
  993                           if (!(offset == con.start
  994                                 || offset > con.start && isEOLChar( target [  offset-1 ] )))
  995                               return -1;
  996                       } else {
  997                           if (offset != con.start)
  998                               return -1;
  999                       }
 1000                       break;
 1001   
 1002                   case '@':                         // Internal use only.
 1003                       // The @ always matches line beginnings.
 1004                       if (!(offset == con.start
 1005                             || offset > con.start && isEOLChar( target [  offset-1 ] )))
 1006                           return -1;
 1007                       break;
 1008   
 1009                   case '$':
 1010                       if (isSet(opts, MULTIPLE_LINES)) {
 1011                           if (!(offset == con.limit
 1012                                 || offset < con.limit && isEOLChar( target [  offset ] )))
 1013                               return -1;
 1014                       } else {
 1015                           if (!(offset == con.limit
 1016                                 || offset+1 == con.limit && isEOLChar( target [  offset ] )
 1017                                 || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
 1018                                 &&  target [  offset+1 ]  == LINE_FEED))
 1019                               return -1;
 1020                       }
 1021                       break;
 1022   
 1023                   case 'A':
 1024                       if (offset != con.start)  return -1;
 1025                       break;
 1026   
 1027                   case 'Z':
 1028                       if (!(offset == con.limit
 1029                             || offset+1 == con.limit && isEOLChar( target [  offset ] )
 1030                             || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
 1031                             &&  target [  offset+1 ]  == LINE_FEED))
 1032                           return -1;
 1033                       break;
 1034   
 1035                   case 'z':
 1036                       if (offset != con.limit)  return -1;
 1037                       break;
 1038   
 1039                   case 'b':
 1040                       if (con.length == 0)  return -1;
 1041                       {
 1042                           int after = getWordType(target, con.start, con.limit, offset, opts);
 1043                           if (after == WT_IGNORE)  return -1;
 1044                           int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
 1045                           if (after == before)  return -1;
 1046                       }
 1047                       break;
 1048   
 1049                   case 'B':
 1050                       if (con.length == 0)
 1051                           go = true;
 1052                       else {
 1053                           int after = getWordType(target, con.start, con.limit, offset, opts);
 1054                           go = after == WT_IGNORE
 1055                                || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
 1056                       }
 1057                       if (!go)  return -1;
 1058                       break;
 1059   
 1060                   case '<':
 1061                       if (con.length == 0 || offset == con.limit)  return -1;
 1062                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
 1063                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
 1064                           return -1;
 1065                       break;
 1066   
 1067                   case '>':
 1068                       if (con.length == 0 || offset == con.start)  return -1;
 1069                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
 1070                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
 1071                           return -1;
 1072                       break;
 1073                   } // switch anchor type
 1074                   op = op.next;
 1075                   break;
 1076   
 1077               case Op.BACKREFERENCE:
 1078                   {
 1079                       int refno = op.getData();
 1080                       if (refno <= 0 || refno >= this.nofparen)
 1081                           throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
 1082                       if (con.match.getBeginning(refno) < 0
 1083                           || con.match.getEnd(refno) < 0)
 1084                           return -1;                // ********
 1085                       int o2 = con.match.getBeginning(refno);
 1086                       int literallen = con.match.getEnd(refno)-o2;
 1087                       if (!isSet(opts, IGNORE_CASE)) {
 1088                           if (dx > 0) {
 1089                               if (!regionMatches(target, offset, con.limit, o2, literallen))
 1090                                   return -1;
 1091                               offset += literallen;
 1092                           } else {
 1093                               if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
 1094                                   return -1;
 1095                               offset -= literallen;
 1096                           }
 1097                       } else {
 1098                           if (dx > 0) {
 1099                               if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
 1100                                   return -1;
 1101                               offset += literallen;
 1102                           } else {
 1103                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 1104                                                            o2, literallen))
 1105                                   return -1;
 1106                               offset -= literallen;
 1107                           }
 1108                       }
 1109                   }
 1110                   op = op.next;
 1111                   break;
 1112               case Op.STRING:
 1113                   {
 1114                       String literal = op.getString();
 1115                       int literallen = literal.length();
 1116                       if (!isSet(opts, IGNORE_CASE)) {
 1117                           if (dx > 0) {
 1118                               if (!regionMatches(target, offset, con.limit, literal, literallen))
 1119                                   return -1;
 1120                               offset += literallen;
 1121                           } else {
 1122                               if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
 1123                                   return -1;
 1124                               offset -= literallen;
 1125                           }
 1126                       } else {
 1127                           if (dx > 0) {
 1128                               if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
 1129                                   return -1;
 1130                               offset += literallen;
 1131                           } else {
 1132                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 1133                                                            literal, literallen))
 1134                                   return -1;
 1135                               offset -= literallen;
 1136                           }
 1137                       }
 1138                   }
 1139                   op = op.next;
 1140                   break;
 1141   
 1142               case Op.CLOSURE:
 1143                   {
 1144                       /*
 1145                        * Saves current position to avoid
 1146                        * zero-width repeats.
 1147                        */
 1148                       int id = op.getData();
 1149                       if (id >= 0) {
 1150                           int previousOffset = con.offsets[id];
 1151                           if (previousOffset < 0 || previousOffset != offset) {
 1152                               con.offsets[id] = offset;
 1153                           } else {
 1154                               con.offsets[id] = -1;
 1155                               op = op.next;
 1156                               break;
 1157                           }
 1158                       }
 1159   
 1160                       int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
 1161                       if (id >= 0)  con.offsets[id] = -1;
 1162                       if (ret >= 0)  return ret;
 1163                       op = op.next;
 1164                   }
 1165                   break;
 1166   
 1167               case Op.QUESTION:
 1168                   {
 1169                       int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
 1170                       if (ret >= 0)  return ret;
 1171                       op = op.next;
 1172                   }
 1173                   break;
 1174   
 1175               case Op.NONGREEDYCLOSURE:
 1176               case Op.NONGREEDYQUESTION:
 1177                   {
 1178                       int ret = this. matchCharArray (con, op.next, offset, dx, opts);
 1179                       if (ret >= 0)  return ret;
 1180                       op = op.getChild();
 1181                   }
 1182                   break;
 1183   
 1184               case Op.UNION:
 1185                   for (int i = 0;  i < op.size();  i ++) {
 1186                       int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
 1187                       if (DEBUG) {
 1188                           System.err.println("UNION: "+i+", ret="+ret);
 1189                       }
 1190                       if (ret >= 0)  return ret;
 1191                   }
 1192                   return -1;
 1193   
 1194               case Op.CAPTURE:
 1195                   int refno = op.getData();
 1196                   if (con.match != null && refno > 0) {
 1197                       int save = con.match.getBeginning(refno);
 1198                       con.match.setBeginning(refno, offset);
 1199                       int ret = this. matchCharArray (con, op.next, offset, dx, opts);
 1200                       if (ret < 0)  con.match.setBeginning(refno, save);
 1201                       return ret;
 1202                   } else if (con.match != null && refno < 0) {
 1203                       int index = -refno;
 1204                       int save = con.match.getEnd(index);
 1205                       con.match.setEnd(index, offset);
 1206                       int ret = this. matchCharArray (con, op.next, offset, dx, opts);
 1207                       if (ret < 0)  con.match.setEnd(index, save);
 1208                       return ret;
 1209                   }
 1210                   op = op.next;
 1211                   break;
 1212   
 1213               case Op.LOOKAHEAD:
 1214                   if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts))  return -1;
 1215                   op = op.next;
 1216                   break;
 1217               case Op.NEGATIVELOOKAHEAD:
 1218                   if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts))  return -1;
 1219                   op = op.next;
 1220                   break;
 1221               case Op.LOOKBEHIND:
 1222                   if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts))  return -1;
 1223                   op = op.next;
 1224                   break;
 1225               case Op.NEGATIVELOOKBEHIND:
 1226                   if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts))  return -1;
 1227                   op = op.next;
 1228                   break;
 1229   
 1230               case Op.INDEPENDENT:
 1231                   {
 1232                       int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
 1233                       if (ret < 0)  return ret;
 1234                       offset = ret;
 1235                       op = op.next;
 1236                   }
 1237                   break;
 1238   
 1239               case Op.MODIFIER:
 1240                   {
 1241                       int localopts = opts;
 1242                       localopts |= op.getData();
 1243                       localopts &= ~op.getData2();
 1244                       //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
 1245                       int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts);
 1246                       if (ret < 0)  return ret;
 1247                       offset = ret;
 1248                       op = op.next;
 1249                   }
 1250                   break;
 1251   
 1252               case Op.CONDITION:
 1253                   {
 1254                       Op.ConditionOp cop = (Op.ConditionOp)op;
 1255                       boolean matchp = false;
 1256                       if (cop.refNumber > 0) {
 1257                           if (cop.refNumber >= this.nofparen)
 1258                               throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
 1259                           matchp = con.match.getBeginning(cop.refNumber) >= 0
 1260                                    && con.match.getEnd(cop.refNumber) >= 0;
 1261                       } else {
 1262                           matchp = 0 <= this. matchCharArray (con, cop.condition, offset, dx, opts);
 1263                       }
 1264   
 1265                       if (matchp) {
 1266                           op = cop.yes;
 1267                       } else if (cop.no != null) {
 1268                           op = cop.no;
 1269                       } else {
 1270                           op = cop.next;
 1271                       }
 1272                   }
 1273                   break;
 1274   
 1275               default:
 1276                   throw new RuntimeException("Unknown operation type: "+op.type);
 1277               } // switch (op.type)
 1278           } // while
 1279       }
 1280   
 1281       private static final int getPreviousWordType(char[]  target, int begin, int end,
 1282                                                    int offset, int opts) {
                // Returns the word type of the nearest position before `offset` whose
                // type is not WT_IGNORE. The scan walks backwards one index at a time;
                // getWordType() reports WT_OTHER for any index outside [begin, end),
                // so the loop stops at the latest once it steps in front of `begin`.
 1283           int ret = getWordType(target, begin, end, --offset, opts);
 1284           while (ret == WT_IGNORE)
 1285               ret = getWordType(target, begin, end, --offset, opts);
 1286           return ret;
 1287       }
 1288   
 1289       private static final int getWordType(char[]  target, int begin, int end,
 1290                                            int offset, int opts) {
                // Word type of target[offset]. Any index outside [begin, end) is
                // reported as WT_OTHER without reading the array; otherwise the
                // classification is delegated to getWordType0(char, opts).
 1291           if (offset < begin || offset >= end)  return WT_OTHER;
 1292           return getWordType0( target [  offset ] , opts);
 1293       }
 1294   
 1295   
 1296   
 1297       private static final boolean regionMatches(char[]  target, int offset, int limit,
 1298                                                  String part, int partlen) {
                // Exact (case-sensitive) comparison of target[offset .. offset+partlen)
                // with the first `partlen` chars of `part`.
                // A negative start offset never matches.
 1299           if (offset < 0)  return false;
                // Fewer than `partlen` chars lie between offset and limit.
 1300           if (limit-offset < partlen)
 1301               return false;
 1302           int i = 0;
                // `partlen` doubles as the countdown of chars still to compare.
 1303           while (partlen-- > 0) {
 1304               if ( target [  offset++ ]  != part.charAt(i++))
 1305                   return false;
 1306           }
 1307           return true;
 1308       }
 1309   
 1310       private static final boolean regionMatches(char[]  target, int offset, int limit,
 1311                                                  int offset2, int partlen) {
                // Exact (case-sensitive) comparison of two regions of the same array:
                // target[offset .. offset+partlen) against target[offset2 .. offset2+partlen).
                // Only the region starting at `offset` is range-checked (non-negative
                // start, at least `partlen` chars before `limit`); `offset2` is used as given.
 1312           if (offset < 0)  return false;
 1313           if (limit-offset < partlen)
 1314               return false;
 1315           int i = offset2;
 1316           while (partlen-- > 0) {
 1317               if ( target [  offset++ ]  !=  target [  i++ ] )
 1318                   return false;
 1319           }
 1320           return true;
 1321       }
 1322   
 1323   /**
         * Case-insensitive comparison of <code>partlen</code> chars of <var>target</var>,
         * starting at <var>offset</var>, with the first <code>partlen</code> chars of
         * <var>part</var>.
         * Two chars are treated as equal when they are identical, when their upper-case
         * forms are identical, or when the lower-case forms of their upper-case forms
         * are identical.
         *
         * @return false if <var>offset</var> is negative, if fewer than
         *         <code>partlen</code> chars lie between <var>offset</var> and
         *         <var>limit</var>, or if any char pair differs; true otherwise.
 1324    * @see java.lang.String#regionMatches
 1325    */
 1326       private static final boolean regionMatchesIgnoreCase(char[]  target, int offset, int limit,
 1327                                                            String part, int partlen) {
 1328           if (offset < 0)  return false;
 1329           if (limit-offset < partlen)
 1330               return false;
 1331           int i = 0;
 1332           while (partlen-- > 0) {
 1333               char ch1 =  target [  offset++ ] ;
 1334               char ch2 = part.charAt(i++);
                    // Cheapest test first: identical chars need no case mapping.
 1335               if (ch1 == ch2)
 1336                   continue;
 1337               char uch1 = Character.toUpperCase(ch1);
 1338               char uch2 = Character.toUpperCase(ch2);
 1339               if (uch1 == uch2)
 1340                   continue;
                    // Last chance: compare the lower-case forms of the upper-cased chars.
 1341               if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
 1342                   return false;
 1343           }
 1344           return true;
 1345       }
 1346   
 1347       private static final boolean regionMatchesIgnoreCase(char[]  target, int offset, int limit,
 1348                                                            int offset2, int partlen) {
                // Case-insensitive comparison of two regions of the same array:
                // target[offset .. offset+partlen) against target[offset2 .. offset2+partlen).
                // Only the region starting at `offset` is range-checked; `offset2` is used as given.
 1349           if (offset < 0)  return false;
 1350           if (limit-offset < partlen)
 1351               return false;
 1352           int i = offset2;
 1353           while (partlen-- > 0) {
 1354               char ch1 =  target [  offset++ ] ;
 1355               char ch2 =  target [  i++ ] ;
                    // Chars are equal if identical, if equal after upper-casing, or if
                    // the lower-case forms of the upper-cased chars are equal.
 1356               if (ch1 == ch2)
 1357                   continue;
 1358               char uch1 = Character.toUpperCase(ch1);
 1359               char uch2 = Character.toUpperCase(ch2);
 1360               if (uch1 == uch2)
 1361                   continue;
 1362               if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
 1363                   return false;
 1364           }
 1365           return true;
 1366       }
 1367   
 1368   
 1369   
 1370   
 1371       /**
 1372        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
             * Delegates to {@link #matches(String, int, int, Match)} over the whole string
             * (range 0 to <code>target.length()</code>) with no <code>Match</code> instance.
 1373        *
             * @param target The text to be examined; must not be null because its length is read.
 1374        * @return true if the target is matched to this regular expression.
 1375        */
 1376       public boolean matches(String  target) {
 1377           return this.matches(target, 0,  target .length() , (Match)null);
 1378       }
 1379   
 1380       /**
 1381        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 1382        * in specified range or not.
             * Delegates to {@link #matches(String, int, int, Match)} with no <code>Match</code> instance.
 1383        *
             * @param target The text to be examined.
 1384        * @param start Start offset of the range.
 1385        * @param end  End offset +1 of the range.
 1386        * @return true if the target is matched to this regular expression.
 1387        */
 1388       public boolean matches(String  target, int start, int end) {
 1389           return this.matches(target, start, end, (Match)null);
 1390       }
 1391   
 1392       /**
 1393        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
             * Delegates to {@link #matches(String, int, int, Match)} over the whole string
             * (range 0 to <code>target.length()</code>).
 1394        *
             * @param target The text to be examined; must not be null because its length is read.
 1395        * @param match A Match instance for storing matching result.
 1396        * @return true if the target is matched to this regular expression.
 1397        */
 1398       public boolean matches(String  target, Match match) {
 1399           return this.matches(target, 0,  target .length() , match);
 1400       }
 1401   
 1402       /**
 1403        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 1404        * in specified range or not.
             * When the <code>XMLSCHEMA_MODE</code> option is set, a single attempt is made at
             * the start of the range and it succeeds only if the match ends exactly at the
             * end of the range.
             * On success, group 0 of the <code>Match</code> in use (if any) is set to the
             * matched region.
 1405        *
             * @param target The text to be examined.
 1406        * @param start Start offset of the range.
 1407        * @param end  End offset +1 of the range.
 1408        * @param match A Match instance for storing matching result.
 1409        * @return true if the target is matched to this regular expression.
 1410        */
 1411       public boolean matches(String  target, int start, int end, Match match) {
 1412   
                // Lazily build the compiled operations and the shared matching context.
 1413           synchronized (this) {
 1414               if (this.operations == null)
 1415                   this.prepare();
 1416               if (this.context == null)
 1417                   this.context = new Context();
 1418           }
 1419           Context con = null;
                // Use the shared context unless it is flagged in use; otherwise
                // work on a private, throw-away Context for this call.
 1420           synchronized (this.context) {
 1421               con = this.context.inuse ? new Context() : this.context;
 1422               con.reset(target, start, end, this.numberOfClosures);
 1423           }
 1424           if (match != null) {
 1425               match.setNumberOfGroups(this.nofparen);
 1426               match.setSource(target);
 1427           } else if (this.hasBackReferences) {
                    // Back references read group positions, so a Match is needed
                    // internally even though the caller did not supply one.
 1428               match = new Match();
 1429               match.setNumberOfGroups(this.nofparen);
 1430               // Need not to call setSource() because
 1431               // a caller can not access this match instance.
 1432           }
 1433           con.match = match;
 1434   
 1435           if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 1436               if (DEBUG) {
 1437                   System.err.println("target string="+target);
 1438               }
 1439               int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
 1440               if (DEBUG) {
 1441                   System.err.println("matchEnd="+matchEnd);
 1442                   System.err.println("con.limit="+con.limit);
 1443               }
                    // Schema mode: the match must cover the range up to con.limit.
 1444               if (matchEnd == con.limit) {
 1445                   if (con.match != null) {
 1446                       con.match.setBeginning(0, con.start);
 1447                       con.match.setEnd(0, matchEnd);
 1448                   }
 1449                   con.inuse = false;
 1450                   return true;
 1451               }
                    // Release the context on mismatch too, as every other exit path
                    // of this method does; otherwise the shared context would stay
                    // flagged in use and later calls would allocate a new Context.
                    con.inuse = false;
 1452               return false;
 1453           }
 1454   
 1455           /*
 1456            * The pattern has only fixed string.
 1457            * The engine uses Boyer-Moore.
 1458            */
 1459           if (this.fixedStringOnly) {
 1460               //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 1461               int o = this.fixedStringTable.matches(target, con.start, con.limit);
 1462               if (o >= 0) {
 1463                   if (con.match != null) {
 1464                       con.match.setBeginning(0, o);
 1465                       con.match.setEnd(0, o+this.fixedString.length());
 1466                   }
 1467                   con.inuse = false;
 1468                   return true;
 1469               }
 1470               con.inuse = false;
 1471               return false;
 1472           }
 1473   
 1474           /*
 1475            * The pattern contains a fixed string.
 1476            * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 1477            * If not, it return with false.
 1478            */
 1479           if (this.fixedString != null) {
 1480               int o = this.fixedStringTable.matches(target, con.start, con.limit);
 1481               if (o < 0) {
 1482                   //System.err.println("Non-match in fixed-string search.");
 1483                   con.inuse = false;
 1484                   return false;
 1485               }
 1486           }
 1487   
                // Last start position worth trying: beyond it fewer than minlength
                // chars remain before con.limit.
 1488           int limit = con.limit-this.minlength;
 1489           int matchStart;
 1490           int matchEnd = -1;
 1491   
 1492           /*
 1493            * Checks whether the expression starts with ".*".
 1494            */
 1495           if (this.operations != null
 1496               && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 1497               if (isSet(this.options, SINGLE_LINE)) {
                        // One attempt at the start of the range is enough here.
 1498                   matchStart = con.start;
 1499                   matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
 1500               } else {
                        // Only positions at the beginning of a line (range start or
                        // right after an EOL char) are tried.
                        // NOTE(review): if minlength is 0, matchStart can reach con.limit and
                        // charAt() then reads at index con.limit -- confirm that this cannot
                        // equal target.length().
 1501                   boolean previousIsEOL = true;
 1502                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 1503                       int ch =  target .charAt(  matchStart ) ;
 1504                       if (isEOLChar(ch)) {
 1505                           previousIsEOL = true;
 1506                       } else {
 1507                           if (previousIsEOL) {
 1508                               if (0 <= (matchEnd = this. matchString (con, this.operations,
 1509                                                                       matchStart, 1, this.options)))
 1510                                   break;
 1511                           }
 1512                           previousIsEOL = false;
 1513                       }
 1514                   }
 1515               }
 1516           }
 1517   
 1518           /*
 1519            * Optimization against the first character.
 1520            */
 1521           else if (this.firstChar != null) {
 1522               //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
 1523               RangeToken range = this.firstChar;
 1524               if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
 1525                   range = this.firstChar.getCaseInsensitiveToken();
 1526                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 1527                       int ch =  target .charAt(  matchStart ) ;
 1528                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
                                // Test the code point composed from the two chars at this position.
 1529                           ch = REUtil.composeFromSurrogates(ch,  target .charAt(  matchStart+1 ) );
 1530                           if (!range.match(ch))  continue;
 1531                       } else {
                                // Also accept the upper-case form, or its lower-case form.
 1532                           if (!range.match(ch)) {
 1533                               char ch1 = Character.toUpperCase((char)ch);
 1534                               if (!range.match(ch1))
 1535                                   if (!range.match(Character.toLowerCase(ch1)))
 1536                                       continue;
 1537                           }
 1538                       }
 1539                       if (0 <= (matchEnd = this. matchString (con, this.operations,
 1540                                                               matchStart, 1, this.options)))
 1541                           break;
 1542                   }
 1543               } else {
 1544                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 1545                       int ch =  target .charAt(  matchStart ) ;
 1546                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
 1547                           ch = REUtil.composeFromSurrogates(ch,  target .charAt(  matchStart+1 ) );
                            // Skip start positions whose first character is not in the range.
 1548                       if (!range.match(ch))  continue;
 1549                       if (0 <= (matchEnd = this. matchString (con, this.operations,
 1550                                                               matchStart, 1, this.options)))
 1551                           break;
 1552                   }
 1553               }
 1554           }
 1555   
 1556           /*
 1557            * Straightforward matching.
 1558            */
 1559           else {
 1560               for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 1561                   if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options)))
 1562                       break;
 1563               }
 1564           }
 1565   
                // matchEnd stays -1 unless one of the attempts above succeeded.
 1566           if (matchEnd >= 0) {
 1567               if (con.match != null) {
 1568                   con.match.setBeginning(0, matchStart);
 1569                   con.match.setEnd(0, matchEnd);
 1570               }
 1571               con.inuse = false;
 1572               return true;
 1573           } else {
 1574               con.inuse = false;
 1575               return false;
 1576           }
 1577       }
 1578   
 1579       /**
 1580        * @return -1 when not match; offset of the end of matched string when match.
 1581        */
 1582       private int matchString (Context con, Op op, int offset, int dx, int opts) {
 1583   
 1584   
 1585   
 1586   
 1587           String target = con.strTarget;
 1588   
 1589   
 1590   
 1591   
 1592           while (true) {
 1593               if (op == null)
 1594                   return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
 1595               if (offset > con.limit || offset < con.start)
 1596                   return -1;
 1597               switch (op.type) {
 1598               case Op.CHAR:
 1599                   if (isSet(opts, IGNORE_CASE)) {
 1600                       int ch = op.getData();
 1601                       if (dx > 0) {
 1602                           if (offset >= con.limit || !matchIgnoreCase(ch,  target .charAt(  offset ) ))
 1603                               return -1;
 1604                           offset ++;
 1605                       } else {
 1606                           int o1 = offset-1;
 1607                           if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target .charAt(  o1 ) ))
 1608                               return -1;
 1609                           offset = o1;
 1610                       }
 1611                   } else {
 1612                       int ch = op.getData();
 1613                       if (dx > 0) {
 1614                           if (offset >= con.limit || ch !=  target .charAt(  offset ) )
 1615                               return -1;
 1616                           offset ++;
 1617                       } else {
 1618                           int o1 = offset-1;
 1619                           if (o1 >= con.limit || o1 < 0 || ch !=  target .charAt(  o1 ) )
 1620                               return -1;
 1621                           offset = o1;
 1622                       }
 1623                   }
 1624                   op = op.next;
 1625                   break;
 1626   
 1627               case Op.DOT:
 1628                   if (dx > 0) {
 1629                       if (offset >= con.limit)
 1630                           return -1;
 1631                       int ch =  target .charAt(  offset ) ;
 1632                       if (isSet(opts, SINGLE_LINE)) {
 1633                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 1634                               offset ++;
 1635                       } else {
 1636                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 1637                               ch = REUtil.composeFromSurrogates(ch,  target .charAt(  ++offset ) );
 1638                           if (isEOLChar(ch))
 1639                               return -1;
 1640                       }
 1641                       offset ++;
 1642                   } else {
 1643                       int o1 = offset-1;
 1644                       if (o1 >= con.limit || o1 < 0)
 1645                           return -1;
 1646                       int ch =  target .charAt(  o1 ) ;
 1647                       if (isSet(opts, SINGLE_LINE)) {
 1648                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 1649                               o1 --;
 1650                       } else {
 1651                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 1652                               ch = REUtil.composeFromSurrogates( target .charAt(  --o1 ) , ch);
 1653                           if (!isEOLChar(ch))
 1654                               return -1;
 1655                       }
 1656                       offset = o1;
 1657                   }
 1658                   op = op.next;
 1659                   break;
 1660   
 1661               case Op.RANGE:
 1662               case Op.NRANGE:
 1663                   if (dx > 0) {
 1664                       if (offset >= con.limit)
 1665                           return -1;
 1666                       int ch =  target .charAt(  offset ) ;
 1667                       if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 1668                           ch = REUtil.composeFromSurrogates(ch,  target .charAt(  ++offset ) );
 1669                       RangeToken tok = op.getToken();
 1670                       if (isSet(opts, IGNORE_CASE)) {
 1671                           tok = tok.getCaseInsensitiveToken();
 1672                           if (!tok.match(ch)) {
 1673                               if (ch >= 0x10000)  return -1;
 1674                               char uch;
 1675                               if (!tok.match(uch = Character.toUpperCase((char)ch))
 1676                                   && !tok.match(Character.toLowerCase(uch)))
 1677                                   return -1;
 1678                           }
 1679                       } else {
 1680                           if (!tok.match(ch))  return -1;
 1681                       }
 1682                       offset ++;
 1683                   } else {
 1684                       int o1 = offset-1;
 1685                       if (o1 >= con.limit || o1 < 0)
 1686                           return -1;
 1687                       int ch =  target .charAt(  o1 ) ;
 1688                       if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 1689                           ch = REUtil.composeFromSurrogates( target .charAt(  --o1 ) , ch);
 1690                       RangeToken tok = op.getToken();
 1691                       if (isSet(opts, IGNORE_CASE)) {
 1692                           tok = tok.getCaseInsensitiveToken();
 1693                           if (!tok.match(ch)) {
 1694                               if (ch >= 0x10000)  return -1;
 1695                               char uch;
 1696                               if (!tok.match(uch = Character.toUpperCase((char)ch))
 1697                                   && !tok.match(Character.toLowerCase(uch)))
 1698                                   return -1;
 1699                           }
 1700                       } else {
 1701                           if (!tok.match(ch))  return -1;
 1702                       }
 1703                       offset = o1;
 1704                   }
 1705                   op = op.next;
 1706                   break;
 1707   
 1708               case Op.ANCHOR:
 1709                   boolean go = false;
 1710                   switch (op.getData()) {
 1711                   case '^':
 1712                       if (isSet(opts, MULTIPLE_LINES)) {
 1713                           if (!(offset == con.start
 1714                                 || offset > con.start && isEOLChar( target .charAt(  offset-1 ) )))
 1715                               return -1;
 1716                       } else {
 1717                           if (offset != con.start)
 1718                               return -1;
 1719                       }
 1720                       break;
 1721   
 1722                   case '@':                         // Internal use only.
 1723                       // The @ always matches line beginnings.
 1724                       if (!(offset == con.start
 1725                             || offset > con.start && isEOLChar( target .charAt(  offset-1 ) )))
 1726                           return -1;
 1727                       break;
 1728   
 1729                   case '$':
 1730                       if (isSet(opts, MULTIPLE_LINES)) {
 1731                           if (!(offset == con.limit
 1732                                 || offset < con.limit && isEOLChar( target .charAt(  offset ) )))
 1733                               return -1;
 1734                       } else {
 1735                           if (!(offset == con.limit
 1736                                 || offset+1 == con.limit && isEOLChar( target .charAt(  offset ) )
 1737                                 || offset+2 == con.limit &&  target .charAt(  offset )  == CARRIAGE_RETURN
 1738                                 &&  target .charAt(  offset+1 )  == LINE_FEED))
 1739                               return -1;
 1740                       }
 1741                       break;
 1742   
 1743                   case 'A':
 1744                       if (offset != con.start)  return -1;
 1745                       break;
 1746   
 1747                   case 'Z':
 1748                       if (!(offset == con.limit
 1749                             || offset+1 == con.limit && isEOLChar( target .charAt(  offset ) )
 1750                             || offset+2 == con.limit &&  target .charAt(  offset )  == CARRIAGE_RETURN
 1751                             &&  target .charAt(  offset+1 )  == LINE_FEED))
 1752                           return -1;
 1753                       break;
 1754   
 1755                   case 'z':
 1756                       if (offset != con.limit)  return -1;
 1757                       break;
 1758   
 1759                   case 'b':
 1760                       if (con.length == 0)  return -1;
 1761                       {
 1762                           int after = getWordType(target, con.start, con.limit, offset, opts);
 1763                           if (after == WT_IGNORE)  return -1;
 1764                           int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
 1765                           if (after == before)  return -1;
 1766                       }
 1767                       break;
 1768   
 1769                   case 'B':
 1770                       if (con.length == 0)
 1771                           go = true;
 1772                       else {
 1773                           int after = getWordType(target, con.start, con.limit, offset, opts);
 1774                           go = after == WT_IGNORE
 1775                                || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
 1776                       }
 1777                       if (!go)  return -1;
 1778                       break;
 1779   
 1780                   case '<':
 1781                       if (con.length == 0 || offset == con.limit)  return -1;
 1782                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
 1783                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
 1784                           return -1;
 1785                       break;
 1786   
 1787                   case '>':
 1788                       if (con.length == 0 || offset == con.start)  return -1;
 1789                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
 1790                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
 1791                           return -1;
 1792                       break;
 1793                   } // switch anchor type
 1794                   op = op.next;
 1795                   break;
 1796   
 1797               case Op.BACKREFERENCE:
 1798                   {
 1799                       int refno = op.getData();
 1800                       if (refno <= 0 || refno >= this.nofparen)
 1801                           throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
 1802                       if (con.match.getBeginning(refno) < 0
 1803                           || con.match.getEnd(refno) < 0)
 1804                           return -1;                // ********
 1805                       int o2 = con.match.getBeginning(refno);
 1806                       int literallen = con.match.getEnd(refno)-o2;
 1807                       if (!isSet(opts, IGNORE_CASE)) {
 1808                           if (dx > 0) {
 1809                               if (!regionMatches(target, offset, con.limit, o2, literallen))
 1810                                   return -1;
 1811                               offset += literallen;
 1812                           } else {
 1813                               if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
 1814                                   return -1;
 1815                               offset -= literallen;
 1816                           }
 1817                       } else {
 1818                           if (dx > 0) {
 1819                               if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
 1820                                   return -1;
 1821                               offset += literallen;
 1822                           } else {
 1823                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 1824                                                            o2, literallen))
 1825                                   return -1;
 1826                               offset -= literallen;
 1827                           }
 1828                       }
 1829                   }
 1830                   op = op.next;
 1831                   break;
 1832               case Op.STRING:
 1833                   {
 1834                       String literal = op.getString();
 1835                       int literallen = literal.length();
 1836                       if (!isSet(opts, IGNORE_CASE)) {
 1837                           if (dx > 0) {
 1838                               if (!regionMatches(target, offset, con.limit, literal, literallen))
 1839                                   return -1;
 1840                               offset += literallen;
 1841                           } else {
 1842                               if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
 1843                                   return -1;
 1844                               offset -= literallen;
 1845                           }
 1846                       } else {
 1847                           if (dx > 0) {
 1848                               if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
 1849                                   return -1;
 1850                               offset += literallen;
 1851                           } else {
 1852                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 1853                                                            literal, literallen))
 1854                                   return -1;
 1855                               offset -= literallen;
 1856                           }
 1857                       }
 1858                   }
 1859                   op = op.next;
 1860                   break;
 1861   
 1862               case Op.CLOSURE:
 1863                   {
 1864                       /*
 1865                        * Saves current position to avoid
 1866                        * zero-width repeats.
 1867                        */
 1868                       int id = op.getData();
 1869                       if (id >= 0) {
 1870                           int previousOffset = con.offsets[id];
 1871                           if (previousOffset < 0 || previousOffset != offset) {
 1872                               con.offsets[id] = offset;
 1873                           } else {
 1874                               con.offsets[id] = -1;
 1875                               op = op.next;
 1876                               break;
 1877                           }
 1878                       }
 1879                       int ret = this. matchString (con, op.getChild(), offset, dx, opts);
 1880                       if (id >= 0)  con.offsets[id] = -1;
 1881                       if (ret >= 0)  return ret;
 1882                       op = op.next;
 1883                   }
 1884                   break;
 1885   
 1886               case Op.QUESTION:
 1887                   {
 1888                       int ret = this. matchString (con, op.getChild(), offset, dx, opts);
 1889                       if (ret >= 0)  return ret;
 1890                       op = op.next;
 1891                   }
 1892                   break;
 1893   
 1894               case Op.NONGREEDYCLOSURE:
 1895               case Op.NONGREEDYQUESTION:
 1896                   {
 1897                       int ret = this. matchString (con, op.next, offset, dx, opts);
 1898                       if (ret >= 0)  return ret;
 1899                       op = op.getChild();
 1900                   }
 1901                   break;
 1902   
 1903               case Op.UNION:
 1904                   for (int i = 0;  i < op.size();  i ++) {
 1905                       int ret = this. matchString (con, op.elementAt(i), offset, dx, opts);
 1906                       if (DEBUG) {
 1907                           System.err.println("UNION: "+i+", ret="+ret);
 1908                       }
 1909                       if (ret >= 0)  return ret;
 1910                   }
 1911                   return -1;
 1912   
 1913               case Op.CAPTURE:
 1914                   int refno = op.getData();
 1915                   if (con.match != null && refno > 0) {
 1916                       int save = con.match.getBeginning(refno);
 1917                       con.match.setBeginning(refno, offset);
 1918                       int ret = this. matchString (con, op.next, offset, dx, opts);
 1919                       if (ret < 0)  con.match.setBeginning(refno, save);
 1920                       return ret;
 1921                   } else if (con.match != null && refno < 0) {
 1922                       int index = -refno;
 1923                       int save = con.match.getEnd(index);
 1924                       con.match.setEnd(index, offset);
 1925                       int ret = this. matchString (con, op.next, offset, dx, opts);
 1926                       if (ret < 0)  con.match.setEnd(index, save);
 1927                       return ret;
 1928                   }
 1929                   op = op.next;
 1930                   break;
 1931   
 1932               case Op.LOOKAHEAD:
 1933                   if (0 > this. matchString (con, op.getChild(), offset, 1, opts))  return -1;
 1934                   op = op.next;
 1935                   break;
 1936               case Op.NEGATIVELOOKAHEAD:
 1937                   if (0 <= this. matchString (con, op.getChild(), offset, 1, opts))  return -1;
 1938                   op = op.next;
 1939                   break;
 1940               case Op.LOOKBEHIND:
 1941                   if (0 > this. matchString (con, op.getChild(), offset, -1, opts))  return -1;
 1942                   op = op.next;
 1943                   break;
 1944               case Op.NEGATIVELOOKBEHIND:
 1945                   if (0 <= this. matchString (con, op.getChild(), offset, -1, opts))  return -1;
 1946                   op = op.next;
 1947                   break;
 1948   
 1949               case Op.INDEPENDENT:
 1950                   {
 1951                       int ret = this. matchString (con, op.getChild(), offset, dx, opts);
 1952                       if (ret < 0)  return ret;
 1953                       offset = ret;
 1954                       op = op.next;
 1955                   }
 1956                   break;
 1957   
 1958               case Op.MODIFIER:
 1959                   {
 1960                       int localopts = opts;
 1961                       localopts |= op.getData();
 1962                       localopts &= ~op.getData2();
 1963                       //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
 1964                       int ret = this. matchString (con, op.getChild(), offset, dx, localopts);
 1965                       if (ret < 0)  return ret;
 1966                       offset = ret;
 1967                       op = op.next;
 1968                   }
 1969                   break;
 1970   
 1971               case Op.CONDITION:
 1972                   {
 1973                       Op.ConditionOp cop = (Op.ConditionOp)op;
 1974                       boolean matchp = false;
 1975                       if (cop.refNumber > 0) {
 1976                           if (cop.refNumber >= this.nofparen)
 1977                               throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
 1978                           matchp = con.match.getBeginning(cop.refNumber) >= 0
 1979                                    && con.match.getEnd(cop.refNumber) >= 0;
 1980                       } else {
 1981                           matchp = 0 <= this. matchString (con, cop.condition, offset, dx, opts);
 1982                       }
 1983   
 1984                       if (matchp) {
 1985                           op = cop.yes;
 1986                       } else if (cop.no != null) {
 1987                           op = cop.no;
 1988                       } else {
 1989                           op = cop.next;
 1990                       }
 1991                   }
 1992                   break;
 1993   
 1994               default:
 1995                   throw new RuntimeException("Unknown operation type: "+op.type);
 1996               } // switch (op.type)
 1997           } // while
 1998       }
 1999   
 2000       private static final int getPreviousWordType(String  target, int begin, int end,
 2001                                                    int offset, int opts) {
 2002           int ret = getWordType(target, begin, end, --offset, opts);
 2003           while (ret == WT_IGNORE)
 2004               ret = getWordType(target, begin, end, --offset, opts);
 2005           return ret;
 2006       }
 2007   
 2008       private static final int getWordType(String  target, int begin, int end,
 2009                                            int offset, int opts) {
 2010           if (offset < begin || offset >= end)  return WT_OTHER;
 2011           return getWordType0( target .charAt(  offset ) , opts);
 2012       }
 2013   
 2014   
 2015       private static final boolean regionMatches(String text, int offset, int limit,
 2016                                                  String part, int partlen) {
 2017           if (limit-offset < partlen)  return false;
 2018           return text.regionMatches(offset, part, 0, partlen);
 2019       }
 2020   
 2021       private static final boolean regionMatches(String text, int offset, int limit,
 2022                                                  int offset2, int partlen) {
 2023           if (limit-offset < partlen)  return false;
 2024           return text.regionMatches(offset, text, offset2, partlen);
 2025       }
 2026   
 2027       private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit,
 2028                                                            String part, int partlen) {
 2029           return text.regionMatches(true, offset, part, 0, partlen);
 2030       }
 2031   
 2032       private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit,
 2033                                                            int offset2, int partlen) {
 2034           if (limit-offset < partlen)  return false;
 2035           return text.regionMatches(true, offset, text, offset2, partlen);
 2036       }
 2037   
 2038   
 2039   
 2040   
 2041   
 2042   
 2043   
 2044       /**
 2045        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 2046        *
 2047        * @return true if the target is matched to this regular expression.
 2048        */
 2049       public boolean matches(CharacterIterator target) {
 2050           return this.matches(target, (Match)null);
 2051       }
 2052   
 2053   
 2054       /**
 2055        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 2056        *
 2057        * @param match A Match instance for storing matching result.
 2058        * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 2059        */
 2060       public boolean matches(CharacterIterator  target, Match match) {
 2061           int start = target.getBeginIndex();
 2062           int end = target.getEndIndex();
 2063   
 2064   
 2065   
 2066           synchronized (this) {
 2067               if (this.operations == null)
 2068                   this.prepare();
 2069               if (this.context == null)
 2070                   this.context = new Context();
 2071           }
 2072           Context con = null;
 2073           synchronized (this.context) {
 2074               con = this.context.inuse ? new Context() : this.context;
 2075               con.reset(target, start, end, this.numberOfClosures);
 2076           }
 2077           if (match != null) {
 2078               match.setNumberOfGroups(this.nofparen);
 2079               match.setSource(target);
 2080           } else if (this.hasBackReferences) {
 2081               match = new Match();
 2082               match.setNumberOfGroups(this.nofparen);
 2083               // Need not to call setSource() because
 2084               // a caller can not access this match instance.
 2085           }
 2086           con.match = match;
 2087   
 2088           if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 2089               int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
 2090               //System.err.println("DEBUG: matchEnd="+matchEnd);
 2091               if (matchEnd == con.limit) {
 2092                   if (con.match != null) {
 2093                       con.match.setBeginning(0, con.start);
 2094                       con.match.setEnd(0, matchEnd);
 2095                   }
 2096                   con.inuse = false;
 2097                   return true;
 2098               }
 2099               return false;
 2100           }
 2101   
 2102           /*
 2103            * The pattern has only fixed string.
 2104            * The engine uses Boyer-Moore.
 2105            */
 2106           if (this.fixedStringOnly) {
 2107               //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 2108               int o = this.fixedStringTable.matches(target, con.start, con.limit);
 2109               if (o >= 0) {
 2110                   if (con.match != null) {
 2111                       con.match.setBeginning(0, o);
 2112                       con.match.setEnd(0, o+this.fixedString.length());
 2113                   }
 2114                   con.inuse = false;
 2115                   return true;
 2116               }
 2117               con.inuse = false;
 2118               return false;
 2119           }
 2120   
 2121           /*
 2122            * The pattern contains a fixed string.
 2123            * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 2124            * If not, it return with false.
 2125            */
 2126           if (this.fixedString != null) {
 2127               int o = this.fixedStringTable.matches(target, con.start, con.limit);
 2128               if (o < 0) {
 2129                   //System.err.println("Non-match in fixed-string search.");
 2130                   con.inuse = false;
 2131                   return false;
 2132               }
 2133           }
 2134   
 2135           int limit = con.limit-this.minlength;
 2136           int matchStart;
 2137           int matchEnd = -1;
 2138   
 2139           /*
 2140            * Checks whether the expression starts with ".*".
 2141            */
 2142           if (this.operations != null
 2143               && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 2144               if (isSet(this.options, SINGLE_LINE)) {
 2145                   matchStart = con.start;
 2146                   matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
 2147               } else {
 2148                   boolean previousIsEOL = true;
 2149                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 2150                       int ch =  target .setIndex(  matchStart ) ;
 2151                       if (isEOLChar(ch)) {
 2152                           previousIsEOL = true;
 2153                       } else {
 2154                           if (previousIsEOL) {
 2155                               if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
 2156                                                                                  matchStart, 1, this.options)))
 2157                                   break;
 2158                           }
 2159                           previousIsEOL = false;
 2160                       }
 2161                   }
 2162               }
 2163           }
 2164   
 2165           /*
 2166            * Optimization against the first character.
 2167            */
 2168           else if (this.firstChar != null) {
 2169               //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
 2170               RangeToken range = this.firstChar;
 2171               if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
 2172                   range = this.firstChar.getCaseInsensitiveToken();
 2173                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 2174                       int ch =  target .setIndex(  matchStart ) ;
 2175                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
 2176                           ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  matchStart+1 ) );
 2177                           if (!range.match(ch))  continue;
 2178                       } else {
 2179                           if (!range.match(ch)) {
 2180                               char ch1 = Character.toUpperCase((char)ch);
 2181                               if (!range.match(ch1))
 2182                                   if (!range.match(Character.toLowerCase(ch1)))
 2183                                       continue;
 2184                           }
 2185                       }
 2186                       if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
 2187                                                                          matchStart, 1, this.options)))
 2188                           break;
 2189                   }
 2190               } else {
 2191                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 2192                       int ch =  target .setIndex(  matchStart ) ;
 2193                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
 2194                           ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  matchStart+1 ) );
 2195                       if (!range.match(ch))  continue;
 2196                       if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
 2197                                                                          matchStart, 1, this.options)))
 2198                           break;
 2199                   }
 2200               }
 2201           }
 2202   
 2203           /*
 2204            * Straightforward matching.
 2205            */
 2206           else {
 2207               for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 2208                   if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, matchStart, 1, this.options)))
 2209                       break;
 2210               }
 2211           }
 2212   
 2213           if (matchEnd >= 0) {
 2214               if (con.match != null) {
 2215                   con.match.setBeginning(0, matchStart);
 2216                   con.match.setEnd(0, matchEnd);
 2217               }
 2218               con.inuse = false;
 2219               return true;
 2220           } else {
 2221               con.inuse = false;
 2222               return false;
 2223           }
 2224       }
 2225   
 2226       /**
 2227        * @return -1 when not match; offset of the end of matched string when match.
 2228        */
 2229       private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
 2230   
 2231   
 2232           CharacterIterator target = con.ciTarget;
 2233   
 2234   
 2235   
 2236   
 2237   
 2238   
 2239           while (true) {
 2240               if (op == null)
 2241                   return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
 2242               if (offset > con.limit || offset < con.start)
 2243                   return -1;
 2244               switch (op.type) {
 2245               case Op.CHAR:
 2246                   if (isSet(opts, IGNORE_CASE)) {
 2247                       int ch = op.getData();
 2248                       if (dx > 0) {
 2249                           if (offset >= con.limit || !matchIgnoreCase(ch,  target .setIndex(  offset ) ))
 2250                               return -1;
 2251                           offset ++;
 2252                       } else {
 2253                           int o1 = offset-1;
 2254                           if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target .setIndex(  o1 ) ))
 2255                               return -1;
 2256                           offset = o1;
 2257                       }
 2258                   } else {
 2259                       int ch = op.getData();
 2260                       if (dx > 0) {
 2261                           if (offset >= con.limit || ch !=  target .setIndex(  offset ) )
 2262                               return -1;
 2263                           offset ++;
 2264                       } else {
 2265                           int o1 = offset-1;
 2266                           if (o1 >= con.limit || o1 < 0 || ch !=  target .setIndex(  o1 ) )
 2267                               return -1;
 2268                           offset = o1;
 2269                       }
 2270                   }
 2271                   op = op.next;
 2272                   break;
 2273   
 2274               case Op.DOT:
 2275                   if (dx > 0) {
 2276                       if (offset >= con.limit)
 2277                           return -1;
 2278                       int ch =  target .setIndex(  offset ) ;
 2279                       if (isSet(opts, SINGLE_LINE)) {
 2280                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 2281                               offset ++;
 2282                       } else {
 2283                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 2284                               ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  ++offset ) );
 2285                           if (isEOLChar(ch))
 2286                               return -1;
 2287                       }
 2288                       offset ++;
 2289                   } else {
 2290                       int o1 = offset-1;
 2291                       if (o1 >= con.limit || o1 < 0)
 2292                           return -1;
 2293                       int ch =  target .setIndex(  o1 ) ;
 2294                       if (isSet(opts, SINGLE_LINE)) {
 2295                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 2296                               o1 --;
 2297                       } else {
 2298                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 2299                               ch = REUtil.composeFromSurrogates( target .setIndex(  --o1 ) , ch);
 2300                           if (!isEOLChar(ch))
 2301                               return -1;
 2302                       }
 2303                       offset = o1;
 2304                   }
 2305                   op = op.next;
 2306                   break;
 2307   
 2308               case Op.RANGE:
 2309               case Op.NRANGE:
 2310                   if (dx > 0) {
 2311                       if (offset >= con.limit)
 2312                           return -1;
 2313                       int ch =  target .setIndex(  offset ) ;
 2314                       if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 2315                           ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  ++offset ) );
 2316                       RangeToken tok = op.getToken();
 2317                       if (isSet(opts, IGNORE_CASE)) {
 2318                           tok = tok.getCaseInsensitiveToken();
 2319                           if (!tok.match(ch)) {
 2320                               if (ch >= 0x10000)  return -1;
 2321                               char uch;
 2322                               if (!tok.match(uch = Character.toUpperCase((char)ch))
 2323                                   && !tok.match(Character.toLowerCase(uch)))
 2324                                   return -1;
 2325                           }
 2326                       } else {
 2327                           if (!tok.match(ch))  return -1;
 2328                       }
 2329                       offset ++;
 2330                   } else {
 2331                       int o1 = offset-1;
 2332                       if (o1 >= con.limit || o1 < 0)
 2333                           return -1;
 2334                       int ch =  target .setIndex(  o1 ) ;
 2335                       if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 2336                           ch = REUtil.composeFromSurrogates( target .setIndex(  --o1 ) , ch);
 2337                       RangeToken tok = op.getToken();
 2338                       if (isSet(opts, IGNORE_CASE)) {
 2339                           tok = tok.getCaseInsensitiveToken();
 2340                           if (!tok.match(ch)) {
 2341                               if (ch >= 0x10000)  return -1;
 2342                               char uch;
 2343                               if (!tok.match(uch = Character.toUpperCase((char)ch))
 2344                                   && !tok.match(Character.toLowerCase(uch)))
 2345                                   return -1;
 2346                           }
 2347                       } else {
 2348                           if (!tok.match(ch))  return -1;
 2349                       }
 2350                       offset = o1;
 2351                   }
 2352                   op = op.next;
 2353                   break;
 2354   
 2355               case Op.ANCHOR:
 2356                   boolean go = false;
 2357                   switch (op.getData()) {
 2358                   case '^':
 2359                       if (isSet(opts, MULTIPLE_LINES)) {
 2360                           if (!(offset == con.start
 2361                                 || offset > con.start && isEOLChar( target .setIndex(  offset-1 ) )))
 2362                               return -1;
 2363                       } else {
 2364                           if (offset != con.start)
 2365                               return -1;
 2366                       }
 2367                       break;
 2368   
 2369                   case '@':                         // Internal use only.
 2370                       // The @ always matches line beginnings.
 2371                       if (!(offset == con.start
 2372                             || offset > con.start && isEOLChar( target .setIndex(  offset-1 ) )))
 2373                           return -1;
 2374                       break;
 2375   
 2376                   case '$':
 2377                       if (isSet(opts, MULTIPLE_LINES)) {
 2378                           if (!(offset == con.limit
 2379                                 || offset < con.limit && isEOLChar( target .setIndex(  offset ) )))
 2380                               return -1;
 2381                       } else {
 2382                           if (!(offset == con.limit
 2383                                 || offset+1 == con.limit && isEOLChar( target .setIndex(  offset ) )
 2384                                 || offset+2 == con.limit &&  target .setIndex(  offset )  == CARRIAGE_RETURN
 2385                                 &&  target .setIndex(  offset+1 )  == LINE_FEED))
 2386                               return -1;
 2387                       }
 2388                       break;
 2389   
 2390                   case 'A':
 2391                       if (offset != con.start)  return -1;
 2392                       break;
 2393   
 2394                   case 'Z':
 2395                       if (!(offset == con.limit
 2396                             || offset+1 == con.limit && isEOLChar( target .setIndex(  offset ) )
 2397                             || offset+2 == con.limit &&  target .setIndex(  offset )  == CARRIAGE_RETURN
 2398                             &&  target .setIndex(  offset+1 )  == LINE_FEED))
 2399                           return -1;
 2400                       break;
 2401   
 2402                   case 'z':
 2403                       if (offset != con.limit)  return -1;
 2404                       break;
 2405   
 2406                   case 'b':
 2407                       if (con.length == 0)  return -1;
 2408                       {
 2409                           int after = getWordType(target, con.start, con.limit, offset, opts);
 2410                           if (after == WT_IGNORE)  return -1;
 2411                           int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
 2412                           if (after == before)  return -1;
 2413                       }
 2414                       break;
 2415   
 2416                   case 'B':
 2417                       if (con.length == 0)
 2418                           go = true;
 2419                       else {
 2420                           int after = getWordType(target, con.start, con.limit, offset, opts);
 2421                           go = after == WT_IGNORE
 2422                                || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
 2423                       }
 2424                       if (!go)  return -1;
 2425                       break;
 2426   
 2427                   case '<':
 2428                       if (con.length == 0 || offset == con.limit)  return -1;
 2429                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
 2430                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
 2431                           return -1;
 2432                       break;
 2433   
 2434                   case '>':
 2435                       if (con.length == 0 || offset == con.start)  return -1;
 2436                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
 2437                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
 2438                           return -1;
 2439                       break;
 2440                   } // switch anchor type
 2441                   op = op.next;
 2442                   break;
 2443   
 2444               case Op.BACKREFERENCE:
 2445                   {
 2446                       int refno = op.getData();
 2447                       if (refno <= 0 || refno >= this.nofparen)
 2448                           throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
 2449                       if (con.match.getBeginning(refno) < 0
 2450                           || con.match.getEnd(refno) < 0)
 2451                           return -1;                // ********
 2452                       int o2 = con.match.getBeginning(refno);
 2453                       int literallen = con.match.getEnd(refno)-o2;
 2454                       if (!isSet(opts, IGNORE_CASE)) {
 2455                           if (dx > 0) {
 2456                               if (!regionMatches(target, offset, con.limit, o2, literallen))
 2457                                   return -1;
 2458                               offset += literallen;
 2459                           } else {
 2460                               if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
 2461                                   return -1;
 2462                               offset -= literallen;
 2463                           }
 2464                       } else {
 2465                           if (dx > 0) {
 2466                               if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
 2467                                   return -1;
 2468                               offset += literallen;
 2469                           } else {
 2470                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 2471                                                            o2, literallen))
 2472                                   return -1;
 2473                               offset -= literallen;
 2474                           }
 2475                       }
 2476                   }
 2477                   op = op.next;
 2478                   break;
 2479               case Op.STRING:
 2480                   {
 2481                       String literal = op.getString();
 2482                       int literallen = literal.length();
 2483                       if (!isSet(opts, IGNORE_CASE)) {
 2484                           if (dx > 0) {
 2485                               if (!regionMatches(target, offset, con.limit, literal, literallen))
 2486                                   return -1;
 2487                               offset += literallen;
 2488                           } else {
 2489                               if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
 2490                                   return -1;
 2491                               offset -= literallen;
 2492                           }
 2493                       } else {
 2494                           if (dx > 0) {
 2495                               if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
 2496                                   return -1;
 2497                               offset += literallen;
 2498                           } else {
 2499                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 2500                                                            literal, literallen))
 2501                                   return -1;
 2502                               offset -= literallen;
 2503                           }
 2504                       }
 2505                   }
 2506                   op = op.next;
 2507                   break;
 2508   
 2509               case Op.CLOSURE:
 2510                   {
 2511                       /*
 2512                        * Saves current position to avoid
 2513                        * zero-width repeats.
 2514                        */
 2515                       int id = op.getData();
 2516                       if (id >= 0) {
 2517                           int previousOffset = con.offsets[id];
 2518                           if (previousOffset < 0 || previousOffset != offset) {
 2519                               con.offsets[id] = offset;
 2520                           } else {
 2521                               con.offsets[id] = -1;
 2522                               op = op.next;
 2523                               break;
 2524                           }
 2525                       }
 2526                       
 2527                       int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
 2528                       if (id >= 0)  con.offsets[id] = -1;
 2529                       if (ret >= 0)  return ret;
 2530                       op = op.next;
 2531                   }
 2532                   break;
 2533   
 2534               case Op.QUESTION:
 2535                   {
 2536                       int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
 2537                       if (ret >= 0)  return ret;
 2538                       op = op.next;
 2539                   }
 2540                   break;
 2541   
 2542               case Op.NONGREEDYCLOSURE:
 2543               case Op.NONGREEDYQUESTION:
 2544                   {
 2545                       int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
 2546                       if (ret >= 0)  return ret;
 2547                       op = op.getChild();
 2548                   }
 2549                   break;
 2550   
 2551               case Op.UNION:
 2552                   for (int i = 0;  i < op.size();  i ++) {
 2553                       int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts);
 2554                       if (DEBUG) {
 2555                           System.err.println("UNION: "+i+", ret="+ret);
 2556                       }
 2557                       if (ret >= 0)  return ret;
 2558                   }
 2559                   return -1;
 2560   
 2561               case Op.CAPTURE:
 2562                   int refno = op.getData();
 2563                   if (con.match != null && refno > 0) {
 2564                       int save = con.match.getBeginning(refno);
 2565                       con.match.setBeginning(refno, offset);
 2566                       int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
 2567                       if (ret < 0)  con.match.setBeginning(refno, save);
 2568                       return ret;
 2569                   } else if (con.match != null && refno < 0) {
 2570                       int index = -refno;
 2571                       int save = con.match.getEnd(index);
 2572                       con.match.setEnd(index, offset);
 2573                       int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
 2574                       if (ret < 0)  con.match.setEnd(index, save);
 2575                       return ret;
 2576                   }
 2577                   op = op.next;
 2578                   break;
 2579   
 2580               case Op.LOOKAHEAD:
 2581                   if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts))  return -1;
 2582                   op = op.next;
 2583                   break;
 2584               case Op.NEGATIVELOOKAHEAD:
 2585                   if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts))  return -1;
 2586                   op = op.next;
 2587                   break;
 2588               case Op.LOOKBEHIND:
 2589                   if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts))  return -1;
 2590                   op = op.next;
 2591                   break;
 2592               case Op.NEGATIVELOOKBEHIND:
 2593                   if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts))  return -1;
 2594                   op = op.next;
 2595                   break;
 2596   
 2597               case Op.INDEPENDENT:
 2598                   {
 2599                       int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
 2600                       if (ret < 0)  return ret;
 2601                       offset = ret;
 2602                       op = op.next;
 2603                   }
 2604                   break;
 2605   
 2606               case Op.MODIFIER:
 2607                   {
 2608                       int localopts = opts;
 2609                       localopts |= op.getData();
 2610                       localopts &= ~op.getData2();
 2611                       //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
 2612                       int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts);
 2613                       if (ret < 0)  return ret;
 2614                       offset = ret;
 2615                       op = op.next;
 2616                   }
 2617                   break;
 2618   
 2619               case Op.CONDITION:
 2620                   {
 2621                       Op.ConditionOp cop = (Op.ConditionOp)op;
 2622                       boolean matchp = false;
 2623                       if (cop.refNumber > 0) {
 2624                           if (cop.refNumber >= this.nofparen)
 2625                               throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
 2626                           matchp = con.match.getBeginning(cop.refNumber) >= 0
 2627                                    && con.match.getEnd(cop.refNumber) >= 0;
 2628                       } else {
 2629                           matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts);
 2630                       }
 2631   
 2632                       if (matchp) {
 2633                           op = cop.yes;
 2634                       } else if (cop.no != null) {
 2635                           op = cop.no;
 2636                       } else {
 2637                           op = cop.next;
 2638                       }
 2639                   }
 2640                   break;
 2641   
 2642               default:
 2643                   throw new RuntimeException("Unknown operation type: "+op.type);
 2644               } // switch (op.type)
 2645           } // while
 2646       }
 2647   
 2648       private static final int getPreviousWordType(CharacterIterator  target, int begin, int end,
 2649                                                    int offset, int opts) {
 2650           int ret = getWordType(target, begin, end, --offset, opts);
 2651           while (ret == WT_IGNORE)
 2652               ret = getWordType(target, begin, end, --offset, opts);
 2653           return ret;
 2654       }
 2655   
 2656       private static final int getWordType(CharacterIterator  target, int begin, int end,
 2657                                            int offset, int opts) {
 2658           if (offset < begin || offset >= end)  return WT_OTHER;
 2659           return getWordType0( target .setIndex(  offset ) , opts);
 2660       }
 2661   
 2662   
 2663   
 2664       private static final boolean regionMatches(CharacterIterator  target, int offset, int limit,
 2665                                                  String part, int partlen) {
 2666           if (offset < 0)  return false;
 2667           if (limit-offset < partlen)
 2668               return false;
 2669           int i = 0;
 2670           while (partlen-- > 0) {
 2671               if ( target .setIndex(  offset++ )  != part.charAt(i++))
 2672                   return false;
 2673           }
 2674           return true;
 2675       }
 2676   
 2677       private static final boolean regionMatches(CharacterIterator  target, int offset, int limit,
 2678                                                  int offset2, int partlen) {
 2679           if (offset < 0)  return false;
 2680           if (limit-offset < partlen)
 2681               return false;
 2682           int i = offset2;
 2683           while (partlen-- > 0) {
 2684               if ( target .setIndex(  offset++ )  !=  target .setIndex(  i++ ) )
 2685                   return false;
 2686           }
 2687           return true;
 2688       }
 2689   
 2690       /**
 2691        * @see java.lang.String#regionMatches
 2692        */
 2693       private static final boolean regionMatchesIgnoreCase(CharacterIterator  target, int offset, int limit,
 2694                                                            String part, int partlen) {
 2695           if (offset < 0)  return false;
 2696           if (limit-offset < partlen)
 2697               return false;
 2698           int i = 0;
 2699           while (partlen-- > 0) {
 2700               char ch1 =  target .setIndex(  offset++ ) ;
 2701               char ch2 = part.charAt(i++);
 2702               if (ch1 == ch2)
 2703                   continue;
 2704               char uch1 = Character.toUpperCase(ch1);
 2705               char uch2 = Character.toUpperCase(ch2);
 2706               if (uch1 == uch2)
 2707                   continue;
 2708               if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
 2709                   return false;
 2710           }
 2711           return true;
 2712       }
 2713   
 2714       private static final boolean regionMatchesIgnoreCase(CharacterIterator  target, int offset, int limit,
 2715                                                            int offset2, int partlen) {
 2716           if (offset < 0)  return false;
 2717           if (limit-offset < partlen)
 2718               return false;
 2719           int i = offset2;
 2720           while (partlen-- > 0) {
 2721               char ch1 =  target .setIndex(  offset++ ) ;
 2722               char ch2 =  target .setIndex(  i++ ) ;
 2723               if (ch1 == ch2)
 2724                   continue;
 2725               char uch1 = Character.toUpperCase(ch1);
 2726               char uch2 = Character.toUpperCase(ch2);
 2727               if (uch1 == uch2)
 2728                   continue;
 2729               if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
 2730                   return false;
 2731           }
 2732           return true;
 2733       }
 2734   
 2735   
 2736   
 2737   
 2738       // ================================================================
 2739   
 2740       /**
 2741        * A regular expression.
 2742        * @serial
 2743        */
 2744       String regex;
 2745       /**
 2746        * @serial
 2747        */
 2748       int options;
 2749   
 2750       /**
 2751        * The number of parenthesis in the regular expression.
 2752        * @serial
 2753        */
 2754       int nofparen;
 2755       /**
 2756        * Internal representation of the regular expression.
 2757        * @serial
 2758        */
 2759       Token tokentree;
 2760   
 2761       boolean hasBackReferences = false;
 2762   
 2763       transient int minlength;
 2764       transient Op operations = null;
 2765       transient int numberOfClosures;
 2766       transient Context context = null;
 2767       transient RangeToken firstChar = null;
 2768   
 2769       transient String fixedString = null;
 2770       transient int fixedStringOptions;
 2771       transient BMPattern fixedStringTable = null;
 2772       transient boolean fixedStringOnly = false;
 2773   
 2774   
 2775       static final class Context {
 2776           CharacterIterator ciTarget;
 2777           String strTarget;
 2778           char[] charTarget;
 2779           int start;
 2780           int limit;
 2781           int length;
 2782           Match match;
 2783           boolean inuse = false;
 2784           int[] offsets;
 2785   
 2786           Context() {
 2787           }
 2788   
 2789           private void resetCommon(int nofclosures) {
 2790               this.length = this.limit-this.start;
 2791               this.inuse = true;
 2792               this.match = null;
 2793               if (this.offsets == null || this.offsets.length != nofclosures)
 2794                   this.offsets = new int[nofclosures];
 2795               for (int i = 0;  i < nofclosures;  i ++)  this.offsets[i] = -1;
 2796           }
 2797           void reset(CharacterIterator target, int start, int limit, int nofclosures) {
 2798               this.ciTarget = target;
 2799               this.start = start;
 2800               this.limit = limit;
 2801               this.resetCommon(nofclosures);
 2802           }
 2803           void reset(String target, int start, int limit, int nofclosures) {
 2804               this.strTarget = target;
 2805               this.start = start;
 2806               this.limit = limit;
 2807               this.resetCommon(nofclosures);
 2808           }
 2809           void reset(char[] target, int start, int limit, int nofclosures) {
 2810               this.charTarget = target;
 2811               this.start = start;
 2812               this.limit = limit;
 2813               this.resetCommon(nofclosures);
 2814           }
 2815       }
 2816   
 2817       /**
 2818        * Prepares for matching.  This method is called just before starting matching.
 2819        */
 2820       void prepare() {
 2821           if (Op.COUNT)  Op.nofinstances = 0;
 2822           this.compile(this.tokentree);
 2823           /*
 2824           if  (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
 2825               Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
 2826               anchor.next = this.operations;
 2827               this.operations = anchor;
 2828           }
 2829           */
 2830           if (Op.COUNT)  System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
 2831   
 2832           this.minlength = this.tokentree.getMinLength();
 2833   
 2834           this.firstChar = null;
 2835           if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
 2836               && !isSet(this.options, XMLSCHEMA_MODE)) {
 2837               RangeToken firstChar = Token.createRange();
 2838               int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
 2839               if (fresult == Token.FC_TERMINAL) {
 2840                   firstChar.compactRanges();
 2841                   this.firstChar = firstChar;
 2842                   if (DEBUG)
 2843                       System.err.println("DEBUG: Use the first character optimization: "+firstChar);
 2844               }
 2845           }
 2846   
 2847           if (this.operations != null
 2848               && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
 2849               && this.operations.next == null) {
 2850               if (DEBUG)
 2851                   System.err.print(" *** Only fixed string! *** ");
 2852               this.fixedStringOnly = true;
 2853               if (this.operations.type == Op.STRING)
 2854                   this.fixedString = this.operations.getString();
 2855               else if (this.operations.getData() >= 0x10000) { // Op.CHAR
 2856                   this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
 2857               } else {
 2858                   char[] ac = new char[1];
 2859                   ac[0] = (char)this.operations.getData();
 2860                   this.fixedString = new String(ac);
 2861               }
 2862               this.fixedStringOptions = this.options;
 2863               this.fixedStringTable = new BMPattern(this.fixedString, 256,
 2864                                                     isSet(this.fixedStringOptions, IGNORE_CASE));
 2865           } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
 2866                      && !isSet(this.options, XMLSCHEMA_MODE)) {
 2867               Token.FixedStringContainer container = new Token.FixedStringContainer();
 2868               this.tokentree.findFixedString(container, this.options);
 2869               this.fixedString = container.token == null ? null : container.token.getString();
 2870               this.fixedStringOptions = container.options;
 2871               if (this.fixedString != null && this.fixedString.length() < 2)
 2872                   this.fixedString = null;
 2873               // This pattern has a fixed string of which length is more than one.
 2874               if (this.fixedString != null) {
 2875                   this.fixedStringTable = new BMPattern(this.fixedString, 256,
 2876                                                         isSet(this.fixedStringOptions, IGNORE_CASE));
 2877                   if (DEBUG) {
 2878                       System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
 2879                                          +"/" //+this.fixedString
 2880                                          +"/"+REUtil.createOptionString(this.fixedStringOptions));
 2881                       System.err.print("String: ");
 2882                       REUtil.dumpString(this.fixedString);
 2883                   }
 2884               }
 2885           }
 2886       }
 2887   
    /**
     * An option.
     * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
     * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
     * does not capture.
     *
     * @see #RegularExpression(java.lang.String,int)
     * @see #setPattern(java.lang.String,int)
    static final int MARK_PARENS = 1<<0;
     */
 2898   
 2899       /**
 2900        * "i"
 2901        */
 2902       static final int IGNORE_CASE = 1<<1;
 2903   
 2904       /**
 2905        * "s"
 2906        */
 2907       static final int SINGLE_LINE = 1<<2;
 2908   
 2909       /**
 2910        * "m"
 2911        */
 2912       static final int MULTIPLE_LINES = 1<<3;
 2913   
 2914       /**
 2915        * "x"
 2916        */
 2917       static final int EXTENDED_COMMENT = 1<<4;
 2918   
 2919       /**
 2920        * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
 2921        *
 2922        * @see #RegularExpression(java.lang.String,int)
 2923        * @see #setPattern(java.lang.String,int)
 2924        * @see #UNICODE_WORD_BOUNDARY
 2925        */
 2926       static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
 2927   
 2928       /**
 2929        * An option.
 2930        * This enables to process locale-independent word boundary for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
 2931        * <p>By default, the engine considers a position between a word character
 2932        * (<span class="REGEX"><Kbd>\w</kbd></span>) and a non word character
 2933        * is a word boundary.
 2934        * <p>By this option, the engine checks word boundaries with the method of
 2935        * 'Unicode Regular Expression Guidelines' Revision 4.
 2936        *
 2937        * @see #RegularExpression(java.lang.String,int)
 2938        * @see #setPattern(java.lang.String,int)
 2939        */
 2940       static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
 2941   
 2942       /**
 2943        * "H"
 2944        */
 2945       static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
 2946       /**
 2947        * "F"
 2948        */
 2949       static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
 2950       /**
 2951        * "X". XML Schema mode.
 2952        */
 2953       static final int XMLSCHEMA_MODE = 1<<9;
 2954       /**
 2955        * ",".
 2956        */
 2957       static final int SPECIAL_COMMA = 1<<10;
 2958   
 2959   
 2960       private static final boolean isSet(int options, int flag) {
 2961           return (options & flag) == flag;
 2962       }
 2963   
 2964       /**
 2965        * Creates a new RegularExpression instance.
 2966        *
 2967        * @param regex A regular expression
 2968        * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
 2969        */
 2970       public RegularExpression(String regex) throws ParseException {
 2971           this.setPattern(regex, null);
 2972       }
 2973   
 2974       /**
 2975        * Creates a new RegularExpression instance with options.
 2976        *
 2977        * @param regex A regular expression
 2978        * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
 2979        * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
 2980        */
 2981       public RegularExpression(String regex, String options) throws ParseException {
 2982           this.setPattern(regex, options);
 2983       }
 2984   
 2985       RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
 2986           this.regex = regex;
 2987           this.tokentree = tok;
 2988           this.nofparen = parens;
 2989           this.options = options;
 2990           this.hasBackReferences = hasBackReferences;
 2991       }
 2992   
 2993       /**
 2994        *
 2995        */
 2996       public void setPattern(String newPattern) throws ParseException {
 2997           this.setPattern(newPattern, this.options);
 2998       }
 2999   
 3000       private void setPattern(String newPattern, int options) throws ParseException {
 3001           this.regex = newPattern;
 3002           this.options = options;
 3003           RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
 3004                            ? new ParserForXMLSchema() : new RegexParser();
 3005           this.tokentree = rp.parse(this.regex, this.options);
 3006           this.nofparen = rp.parennumber;
 3007           this.hasBackReferences = rp.hasBackReferences;
 3008   
 3009           this.operations = null;
 3010           this.context = null;
 3011       }
 3012       /**
 3013        *
 3014        */
 3015       public void setPattern(String newPattern, String options) throws ParseException {
 3016           this.setPattern(newPattern, REUtil.parseOptions(options));
 3017       }
 3018   
 3019       /**
 3020        *
 3021        */
 3022       public String getPattern() {
 3023           return this.regex;
 3024       }
 3025   
 3026       /**
 3027        * Represents this instence in String.
 3028        */
 3029       public String toString() {
 3030           return this.tokentree.toString(this.options);
 3031       }
 3032   
 3033       /**
 3034        * Returns a option string.
 3035        * The order of letters in it may be different from a string specified
 3036        * in a constructor or <code>setPattern()</code>.
 3037        *
 3038        * @see #RegularExpression(java.lang.String,java.lang.String)
 3039        * @see #setPattern(java.lang.String,java.lang.String)
 3040        */
 3041       public String getOptions() {
 3042           return REUtil.createOptionString(this.options);
 3043       }
 3044   
 3045       /**
 3046        *  Return true if patterns are the same and the options are equivalent.
 3047        */
 3048       public boolean equals(Object obj) {
 3049           if (obj == null)  return false;
 3050           if (!(obj instanceof RegularExpression))
 3051               return false;
 3052           RegularExpression r = (RegularExpression)obj;
 3053           return this.regex.equals(r.regex) && this.options == r.options;
 3054       }
 3055   
 3056       boolean equals(String pattern, int options) {
 3057           return this.regex.equals(pattern) && this.options == options;
 3058       }
 3059   
 3060       /**
 3061        *
 3062        */
 3063       public int hashCode() {
 3064           return (this.regex+"/"+this.getOptions()).hashCode();
 3065       }
 3066   
 3067       /**
 3068        * Return the number of regular expression groups.
 3069        * This method returns 1 when the regular expression has no capturing-parenthesis.
 3070        *
 3071        */
 3072       public int getNumberOfGroups() {
 3073           return this.nofparen;
 3074       }
 3075   
 3076       // ================================================================
 3077   
 3078       private static final int WT_IGNORE = 0;
 3079       private static final int WT_LETTER = 1;
 3080       private static final int WT_OTHER = 2;
 3081       private static final int getWordType0(char ch, int opts) {
 3082           if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
 3083               if (isSet(opts, USE_UNICODE_CATEGORY)) {
 3084                   return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
 3085               }
 3086               return isWordChar(ch) ? WT_LETTER : WT_OTHER;
 3087           }
 3088   
 3089           switch (Character.getType(ch)) {
 3090           case Character.UPPERCASE_LETTER:      // L
 3091           case Character.LOWERCASE_LETTER:      // L
 3092           case Character.TITLECASE_LETTER:      // L
 3093           case Character.MODIFIER_LETTER:       // L
 3094           case Character.OTHER_LETTER:          // L
 3095           case Character.LETTER_NUMBER:         // N
 3096           case Character.DECIMAL_DIGIT_NUMBER:  // N
 3097           case Character.OTHER_NUMBER:          // N
 3098           case Character.COMBINING_SPACING_MARK: // Mc
 3099               return WT_LETTER;
 3100   
 3101           case Character.FORMAT:                // Cf
 3102           case Character.NON_SPACING_MARK:      // Mn
 3103           case Character.ENCLOSING_MARK:        // Mc
 3104               return WT_IGNORE;
 3105   
 3106           case Character.CONTROL:               // Cc
 3107               switch (ch) {
 3108               case '\t':
 3109               case '\n':
 3110               case '\u000B':
 3111               case '\f':
 3112               case '\r':
 3113                   return WT_OTHER;
 3114               default:
 3115                   return WT_IGNORE;
 3116               }
 3117   
 3118           default:
 3119               return WT_OTHER;
 3120           }
 3121       }
 3122   
 3123       // ================================================================
 3124   
 3125       static final int LINE_FEED = 0x000A;
 3126       static final int CARRIAGE_RETURN = 0x000D;
 3127       static final int LINE_SEPARATOR = 0x2028;
 3128       static final int PARAGRAPH_SEPARATOR = 0x2029;
 3129   
 3130       private static final boolean isEOLChar(int ch) {
 3131           return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
 3132           || ch == PARAGRAPH_SEPARATOR;
 3133       }
 3134   
 3135       private static final boolean isWordChar(int ch) { // Legacy word characters
 3136           if (ch == '_')  return true;
 3137           if (ch < '0')  return false;
 3138           if (ch > 'z')  return false;
 3139           if (ch <= '9')  return true;
 3140           if (ch < 'A')  return false;
 3141           if (ch <= 'Z')  return true;
 3142           if (ch < 'a')  return false;
 3143           return true;
 3144       }
 3145   
 3146       private static final boolean matchIgnoreCase(int chardata, int ch) {
 3147           if (chardata == ch)  return true;
 3148           if (chardata > 0xffff || ch > 0xffff)  return false;
 3149           char uch1 = Character.toUpperCase((char)chardata);
 3150           char uch2 = Character.toUpperCase((char)ch);
 3151           if (uch1 == uch2)  return true;
 3152           return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
 3153       }
 3154   }

Home » Xerces-J-src.2.9.1 » org.apache.xerces » impl » xpath » regex » [javadoc | source]