Home » xmlbeans-2.5.0-src » org.apache.xmlbeans.impl » regex » [javadoc | source]

    1   /*   Copyright 2004 The Apache Software Foundation
    2    *
    3    *   Licensed under the Apache License, Version 2.0 (the "License");
    4    *   you may not use this file except in compliance with the License.
    5    *   You may obtain a copy of the License at
    6    *
    7    *       http://www.apache.org/licenses/LICENSE-2.0
    8    *
    9    *   Unless required by applicable law or agreed to in writing, software
   10    *   distributed under the License is distributed on an "AS IS" BASIS,
   11    *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   12    *   See the License for the specific language governing permissions and
   13    *  limitations under the License.
   14    */
   15   
   16   package org.apache.xmlbeans.impl.regex;
   17   
   18   import java.text.CharacterIterator;
   19   
   20   /**
   21    * A regular expression matching engine using Non-deterministic Finite Automaton (NFA).
   22    * This engine does not conform to the POSIX regular expression.
   23    *
   24    * <hr width="50%">
   25    * <h3>How to use</h3>
   26    *
   27    * <dl>
   28    *   <dt>A. Standard way
   29    *   <dd>
   30    * <pre>
   31    * RegularExpression re = new RegularExpression(<var>regex</var>);
   32    * if (re.matches(text)) { ... }
   33    * </pre>
   34    *
   35    *   <dt>B. Capturing groups
   36    *   <dd>
   37    * <pre>
   38    * RegularExpression re = new RegularExpression(<var>regex</var>);
   39    * Match match = new Match();
   40    * if (re.matches(text, match)) {
   41    *     ... // You can refer captured texts with methods of the <code>Match</code> class.
   42    * }
   43    * </pre>
   44    *
   45    * </dl>
   46    *
   47    * <h4>Case-insensitive matching</h4>
   48    * <pre>
   49    * RegularExpression re = new RegularExpression(<var>regex</var>, "i");
   50    * if (re.matches(text)) { ... }
   51    * </pre>
   52    *
   53    * <h4>Options</h4>
   54    * <p>You can specify options to <a href="#RegularExpression(java.lang.String, java.lang.String)"><code>RegularExpression(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>
   55    *    or <a href="#setPattern(java.lang.String, java.lang.String)"><code>setPattern(</code><var>regex</var><code>, </code><var>options</var><code>)</code></a>.
   56    *    This <var>options</var> parameter consists of the following characters.
   57    * </p>
   58    * <dl>
   59    *   <dt><a name="I_OPTION"><code>"i"</code></a>
   60    *   <dd>This option indicates case-insensitive matching.
   61    *   <dt><a name="M_OPTION"><code>"m"</code></a>
   62    *   <dd class="REGEX"><kbd>^</kbd> and <kbd>$</kbd> consider the EOL characters within the text.
   63    *   <dt><a name="S_OPTION"><code>"s"</code></a>
   64    *   <dd class="REGEX"><kbd>.</kbd> matches any one character.
   65    *   <dt><a name="U_OPTION"><code>"u"</code></a>
   66    *   <dd class="REGEX">Redefines <Kbd>\d \D \w \W \s \S \b \B \&lt; \></kbd> to conform to Unicode.
   67    *   <dt><a name="W_OPTION"><code>"w"</code></a>
   68    *   <dd class="REGEX">By this option, <kbd>\b \B \&lt; \></kbd> are processed with the method of
   69    *      'Unicode Regular Expression Guidelines' Revision 4.
   70    *      When "w" and "u" are specified at the same time,
   71    *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
   72    *   <dt><a name="COMMA_OPTION"><code>","</code></a>
   73    *   <dd>The parser treats a comma in a character class as a range separator.
   74    *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
   75    *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
   76    *
   77    *   <dt><a name="X_OPTION"><code>"X"</code></a>
   78    *   <dd class="REGEX">
   79    *       By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
   80    *       The <code>matches()</code> method does not do substring matching
   81    *       but entire string matching.
   82    *
   83    * </dl>
   84    * 
   85    * <hr width="50%">
   86    * <h3>Syntax</h3>
   87    * <table border="1" bgcolor="#ddeeff">
   88    *   <tr>
   89    *    <td>
   90    *     <h4>Differences from the Perl 5 regular expression</h4>
   91    *     <ul>
   92    *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
   93    *      <li>Supports subtraction, union, and intersection operations for character classes.
   94    *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
   95    *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
   96    *          <kbd>\u005c u</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
   97    *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
   98    *          <Kbd>(?{</kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{</kbd><var>code</var><kbd>})</kbd>
   99    *     </ul>
  100    *    </td>
  101    *   </tr>
  102    * </table>
  103    *
  104    * <P>Meta characters are `<KBD>. * + ? { [ ( ) | \ ^ $</KBD>'.</P>
  105    * <ul>
  106    *   <li>Character
  107    *     <dl>
  108    *       <dt class="REGEX"><kbd>.</kbd> (A period)
  109    *       <dd>Matches any one character except the following characters.
  110    *       <dd>LINE FEED (U+000A), CARRIAGE RETURN (U+000D),
  111    *           PARAGRAPH SEPARATOR (U+2029), LINE SEPARATOR (U+2028)
  112    *       <dd>This expression matches one code point in Unicode. It can match a pair of surrogates.
  113    *       <dd>When <a href="#S_OPTION">the "s" option</a> is specified,
  114    *           it matches any character including the above four characters.
  115    *
  116    *       <dt class="REGEX"><Kbd>\e \f \n \r \t</kbd>
  117    *       <dd>Matches ESCAPE (U+001B), FORM FEED (U+000C), LINE FEED (U+000A),
  118    *           CARRIAGE RETURN (U+000D), HORIZONTAL TABULATION (U+0009)
  119    *
  120    *       <dt class="REGEX"><kbd>\c</kbd><var>C</var>
  121    *       <dd>Matches a control character.
  122    *           The <var>C</var> must be one of '<kbd>@</kbd>', '<kbd>A</kbd>'-'<kbd>Z</kbd>',
  123    *           '<kbd>[</kbd>', '<kbd>\u005c</kbd>', '<kbd>]</kbd>', '<kbd>^</kbd>', '<kbd>_</kbd>'.
  124    *           It matches a control character of which the character code is less than
  125    *           the character code of the <var>C</var> by 0x0040.
  126    *       <dd class="REGEX">For example, a <kbd>\cJ</kbd> matches a LINE FEED (U+000A),
  127    *           and a <kbd>\c[</kbd> matches an ESCAPE (U+001B).
  128    *
  129    *       <dt class="REGEX">a non-meta character
  130    *       <dd>Matches the character.
  131    *
  132    *       <dt class="REGEX"><KBD>\</KBD> + a meta character
  133    *       <dd>Matches the meta character.
  134    *
  135    *       <dt class="REGEX"><kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>
  136    *       <dd>Matches a character of which code point is <var>HH</var> (Hexadecimal) in Unicode.
  137    *           You can write just 2 digits for <kbd>\u005cx</kbd><var>HH</var>, and
  138    *           variable length digits for <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd>.
  139    *
  140    *       <!--
  141    *       <dt class="REGEX"><kbd>\u005c u</kbd><var>HHHH</var>
  142    *       <dd>Matches a character of which code point is <var>HHHH</var> (Hexadecimal) in Unicode.
  143    *       -->
  144    *
  145    *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
  146    *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
  147    *
  148    *       <dt class="REGEX"><kbd>\g</kbd>
  149    *       <dd>Matches a grapheme.
  150    *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
  151    *
  152    *       <dt class="REGEX"><kbd>\X</kbd>
  153    *       <dd class="REGEX">Matches a combining character sequence.
  154    *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
  155    *     </dl>
  156    *   </li>
  157    *
  158    *   <li>Character class
  159    *     <dl>
  160    *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
  161    *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
  162    *       <dd>Positive character class.  It matches a character in ranges.
  163    *       <dd><var>R<sub>n</sub></var>:
  164    *       <ul>
  165    *         <li class="REGEX">A character (including <Kbd>\e \f \n \r \t</kbd> <kbd>\u005cx</kbd><var>HH</var> <kbd>\u005cx{</kbd><var>HHHH</var><kbd>}</kbd> <!--kbd>\u005c u</kbd><var>HHHH</var--> <kbd>\u005cv</kbd><var>HHHHHH</var>)
  166    *             <p>This range matches the character.
  167    *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
  168    *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
  169    *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
  170    *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
  171    *             <p>...
  172    *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
  173    *             <p>These expressions specify the same ranges as the following expressions.
  174    *       </ul>
  175    *       <p class="REGEX">Enumerated ranges are merged (union operation).
  176    *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
  177    *
  178    *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
  179    *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
  180    *       <dd>Negative character class.  It matches a character not in ranges.
  181    *
  182    *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
  183    *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
  184    *       <dd>Subtraction or union or intersection for character classes.
  185    *       <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
  186    *       <dd>The result of these operations is a <u>positive character class</u>
  187    *           even if an expression includes any negative character classes.
  188    *           You have to take care on this in case-insensitive matching.
  189    *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
  190    *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
  191    *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
  192    *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
  193    *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
  194    *
  195    *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
  196    *       <dd>Character class subtraction for the XML Schema.
  197    *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
  198    *           
  199    *       <dt class="REGEX"><kbd>\d</kbd>
  200    *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
  201    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  202    *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
  203    *
  204    *       <dt class="REGEX"><kbd>\D</kbd>
  205    *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
  206    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  207    *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
  208    *
  209    *       <dt class="REGEX"><kbd>\s</kbd>
  210    *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
  211    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  212    *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
  213    *
  214    *       <dt class="REGEX"><kbd>\S</kbd>
  215    *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
  216    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  217    *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
  218    *
  219    *       <dt class="REGEX"><kbd>\w</kbd>
  220    *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
  221    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  222    *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
  223    *
  224    *       <dt class="REGEX"><kbd>\W</kbd>
  225    *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
  226    *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  227    *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
  228    *
  229    *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
  230    *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
  231    *       The following names are available:
  232    *       <dl>
  233    *         <dt>Unicode General Categories:
  234    *         <dd><kbd>
  235    *       L, M, N, Z, C, P, S, Lu, Ll, Lt, Lm, Lo, Mn, Me, Mc, Nd, Nl, No, Zs, Zl, Zp,
  236    *       Cc, Cf, Cn, Co, Cs, Pd, Ps, Pe, Pc, Po, Sm, Sc, Sk, So,
  237    *         </kbd>
  238    *         <dd>(Currently the Cn category includes U+10000-U+10FFFF characters)
  239    *         <dt>Unicode Blocks:
  240    *         <dd><kbd>
  241    *       Basic Latin, Latin-1 Supplement, Latin Extended-A, Latin Extended-B,
  242    *       IPA Extensions, Spacing Modifier Letters, Combining Diacritical Marks, Greek,
  243    *       Cyrillic, Armenian, Hebrew, Arabic, Devanagari, Bengali, Gurmukhi, Gujarati,
  244    *       Oriya, Tamil, Telugu, Kannada, Malayalam, Thai, Lao, Tibetan, Georgian,
  245    *       Hangul Jamo, Latin Extended Additional, Greek Extended, General Punctuation,
  246    *       Superscripts and Subscripts, Currency Symbols, Combining Marks for Symbols,
  247    *       Letterlike Symbols, Number Forms, Arrows, Mathematical Operators,
  248    *       Miscellaneous Technical, Control Pictures, Optical Character Recognition,
  249    *       Enclosed Alphanumerics, Box Drawing, Block Elements, Geometric Shapes,
  250    *       Miscellaneous Symbols, Dingbats, CJK Symbols and Punctuation, Hiragana,
  251    *       Katakana, Bopomofo, Hangul Compatibility Jamo, Kanbun,
  252    *       Enclosed CJK Letters and Months, CJK Compatibility, CJK Unified Ideographs,
  253    *       Hangul Syllables, High Surrogates, High Private Use Surrogates, Low Surrogates,
  254    *       Private Use, CJK Compatibility Ideographs, Alphabetic Presentation Forms,
  255    *       Arabic Presentation Forms-A, Combining Half Marks, CJK Compatibility Forms,
  256    *       Small Form Variants, Arabic Presentation Forms-B, Specials,
  257    *       Halfwidth and Fullwidth Forms
  258    *         </kbd>
  259    *         <dt>Others:
  260    *         <dd><kbd>ALL</kbd> (Equivalent to <kbd>[\u005cu0000-\u005cv10FFFF]</kbd>)
  261    *         <dd><kbd>ASSIGNED</kbd> (<kbd>\p{ASSIGNED}</kbd> is equivalent to <kbd>\P{Cn}</kbd>)
  262    *         <dd><kbd>UNASSIGNED</kbd>
  263    *             (<kbd>\p{UNASSIGNED}</kbd> is equivalent to <kbd>\p{Cn}</kbd>)
  264    *       </dl>
  265    *
  266    *       <dt class="REGEX"><kbd>\P{</kbd><var>name</var><kbd>}</kbd>
  267    *       <dd>Matches one character not in the specified General Category or the specified Block.
  268    *     </dl>
  269    *   </li>
  270    *
  271    *   <li>Selection and Quantifier
  272    *     <dl>
  273    *       <dt class="REGEX"><VAR>X</VAR><kbd>|</kbd><VAR>Y</VAR>
  274    *       <dd>...
  275    *
  276    *       <dt class="REGEX"><VAR>X</VAR><kbd>*</KBD>
  277    *       <dd>Matches 0 or more <var>X</var>.
  278    *
  279    *       <dt class="REGEX"><VAR>X</VAR><kbd>+</KBD>
  280    *       <dd>Matches 1 or more <var>X</var>.
  281    *
  282    *       <dt class="REGEX"><VAR>X</VAR><kbd>?</KBD>
  283    *       <dd>Matches 0 or 1 <var>X</var>.
  284    *
  285    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>number</var><kbd>}</kbd>
  286    *       <dd>Matches <var>number</var> times.
  287    *
  288    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}</kbd>
  289    *       <dd>...
  290    *
  291    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}</kbd>
  292    *       <dd>...
  293    *
  294    *       <dt class="REGEX"><VAR>X</VAR><kbd>*?</kbd>
  295    *       <dt class="REGEX"><VAR>X</VAR><kbd>+?</kbd>
  296    *       <dt class="REGEX"><VAR>X</VAR><kbd>??</kbd>
  297    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,}?</kbd>
  298    *       <dt class="REGEX"><var>X</var><kbd>{</kbd><var>min</var><kbd>,</kbd><var>max</var><kbd>}?</kbd>
  299    *       <dd>Non-greedy matching.
  300    *     </dl>
  301    *   </li>
  302    *
  303    *   <li>Grouping, Capturing, and Back-reference
  304    *     <dl>
  305    *       <dt class="REGEX"><KBD>(?:</kbd><VAR>X</VAR><kbd>)</KBD>
  306    *       <dd>Grouping. "<KBD>foo+</KBD>" matches "<KBD>foo</KBD>" or "<KBD>foooo</KBD>".
  307    *       If you want it to match "<KBD>foofoo</KBD>" or "<KBD>foofoofoo</KBD>",
  308    *       you have to write "<KBD>(?:foo)+</KBD>".
  309    *
  310    *       <dt class="REGEX"><KBD>(</kbd><VAR>X</VAR><kbd>)</KBD>
  311    *       <dd>Grouping with capturing.
  312    * It makes a group, and applications can find out
  313    * where in the target text a group matched with methods of a <code>Match</code> instance
  314    * after <code><a href="#matches(java.lang.String, org.apache.xmlbeans.impl.regex.Match)">matches(String,Match)</a></code>.
  315    * The 0th group means the whole of this regular expression.
  316    * The <VAR>N</VAR>th group is the inside of the <VAR>N</VAR>th left parenthesis.
  317    * 
  318    *   <p>For instance, a regular expression is
  319    *   "<FONT color=blue><KBD> *([^&lt;:]*) +&lt;([^&gt;]*)&gt; *</KBD></FONT>"
  320    *   and target text is
  321    *   "<FONT color=red><KBD>From: TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>":
  322    *   <ul>
  323    *     <li><code>Match.getCapturedText(0)</code>:
  324    *     "<FONT color=red><KBD> TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;</KBD></FONT>"
  325    *     <li><code>Match.getCapturedText(1)</code>: "<FONT color=red><KBD>TAMURA Kent</KBD></FONT>"
  326    *     <li><code>Match.getCapturedText(2)</code>: "<FONT color=red><KBD>kent@trl.ibm.co.jp</KBD></FONT>"
  327    *   </ul>
  328    *
  329    *       <dt class="REGEX"><kbd>\1 \2 \3 \4 \5 \6 \7 \8 \9</kbd>
  330    *       <dd>
  331    *
  332    *       <dt class="REGEX"><kbd>(?></kbd><var>X</var><kbd>)</kbd>
  333    *       <dd>Independent expression group. ................
  334    *
  335    *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
  336    *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>:</kbd><var>X</var><kbd>)</kbd>
  337    *       <dd>............................
  338    *       <dd>The <var>options</var> or the <var>options2</var> consists of 'i' 'm' 's' 'w'.
  339    *           Note that it can not contain 'u'.
  340    *
  341    *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>)</kbd>
  342    *       <dt class="REGEX"><kbd>(?</kbd><var>options</var><kbd>-</kbd><var>options2</var><kbd>)</kbd>
  343    *       <dd>......
  344    *       <dd>These expressions must be at the beginning of a group.
  345    *     </dl>
  346    *   </li>
  347    *
  348    *   <li>Anchor
  349    *     <dl>
  350    *       <dt class="REGEX"><kbd>\A</kbd>
  351    *       <dd>Matches the beginning of the text.
  352    *
  353    *       <dt class="REGEX"><kbd>\Z</kbd>
  354    *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
  355    *           or CARRIAGE RETURN + LINE FEED at the end of the text.
  356    *
  357    *       <dt class="REGEX"><kbd>\z</kbd>
  358    *       <dd>Matches the end of the text.
  359    *
  360    *       <dt class="REGEX"><kbd>^</kbd>
  361    *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
  362    *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
  363    *           it matches the beginning of the text, or after one of EOL characters (
  364    *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
  365    *           PARAGRAPH SEPARATOR (U+2029).)
  366    *
  367    *       <dt class="REGEX"><kbd>$</kbd>
  368    *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
  369    *           or CARRIAGE RETURN + LINE FEED at the end of the text.
  370    *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
  371    *           it matches the end of the text, or before an EOL character.
  372    *
  373    *       <dt class="REGEX"><kbd>\b</kbd>
  374    *       <dd>Matches word boundary.
  375    *           (See <a href="#W_OPTION">a "w" option</a>)
  376    *
  377    *       <dt class="REGEX"><kbd>\B</kbd>
  378    *       <dd>Matches non word boundary.
  379    *           (See <a href="#W_OPTION">a "w" option</a>)
  380    *
  381    *       <dt class="REGEX"><kbd>\&lt;</kbd>
  382    *       <dd>Matches the beginning of a word.
  383    *           (See <a href="#W_OPTION">a "w" option</a>)
  384    *
  385    *       <dt class="REGEX"><kbd>\&gt;</kbd>
  386    *       <dd>Matches the end of a word.
  387    *           (See <a href="#W_OPTION">a "w" option</a>)
  388    *     </dl>
  389    *   </li>
  390    *   <li>Lookahead and lookbehind
  391    *     <dl>
  392    *       <dt class="REGEX"><kbd>(?=</kbd><var>X</var><kbd>)</kbd>
  393    *       <dd>Lookahead.
  394    *
  395    *       <dt class="REGEX"><kbd>(?!</kbd><var>X</var><kbd>)</kbd>
  396    *       <dd>Negative lookahead.
  397    *
  398    *       <dt class="REGEX"><kbd>(?&lt;=</kbd><var>X</var><kbd>)</kbd>
  399    *       <dd>Lookbehind.
  400    *       <dd>(Note for text capturing......)
  401    *
  402    *       <dt class="REGEX"><kbd>(?&lt;!</kbd><var>X</var><kbd>)</kbd>
  403    *       <dd>Negative lookbehind.
  404    *     </dl>
  405    *   </li>
  406    *
  407    *   <li>Misc.
  408    *     <dl>
  409    *       <dt class="REGEX"><kbd>(?(</Kbd><var>condition</var><Kbd>)</kbd><var>yes-pattern</var><kbd>|</kbd><var>no-pattern</var><kbd>)</kbd>,
  410    *       <dt class="REGEX"><kbd>(?(</kbd><var>condition</var><kbd>)</kbd><var>yes-pattern</var><kbd>)</kbd>
  411    *       <dd>......
  412    *       <dt class="REGEX"><kbd>(?#</kbd><var>comment</var><kbd>)</kbd>
  413    *       <dd>Comment.  A comment string consists of characters except '<kbd>)</kbd>'.
  414    *           You can not write comments in character classes and before quantifiers.
  415    *     </dl>
  416    *   </li>
  417    * </ul>
  418    *
  419    *
  420    * <hr width="50%">
  421    * <h3>BNF for the regular expression</h3>
  422    * <pre>
  423    * regex ::= ('(?' options ')')? term ('|' term)*
  424    * term ::= factor+
  425    * factor ::= anchors | atom (('*' | '+' | '?' | minmax ) '?'? )?
  426    *            | '(?#' [^)]* ')'
  427    * minmax ::= '{' ([0-9]+ | [0-9]+ ',' | ',' [0-9]+ | [0-9]+ ',' [0-9]+) '}'
  428    * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  429    *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block | '\X'
  430    *          | '(?>' regex ')' | '(?' options ':' regex ')'
  431    *          | '(?' ('(' [0-9] ')' | '(' anchors ')' | looks) term ('|' term)? ')'
  432    * options ::= [imsw]* ('-' [imsw]+)?
  433    * anchors ::= '^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\&lt;' | '\>'
  434    * looks ::= '(?=' regex ')'  | '(?!' regex ')'
  435    *           | '(?&lt;=' regex ')' | '(?&lt;!' regex ')'
  436    * char ::= '\\' | '\' [efnrtv] | '\c' [@-_] | code-point | character-1
  437    * category-block ::= '\' [pP] category-symbol-1
  438    *                    | ('\p{' | '\P{') (category-symbol | block-name
  439    *                                       | other-properties) '}'
  440    * category-symbol-1 ::= 'L' | 'M' | 'N' | 'Z' | 'C' | 'P' | 'S'
  441    * category-symbol ::= category-symbol-1 | 'Lu' | 'Ll' | 'Lt' | 'Lm' | 'Lo'
  442    *                     | 'Mn' | 'Me' | 'Mc' | 'Nd' | 'Nl' | 'No'
  443    *                     | 'Zs' | 'Zl' | 'Zp' | 'Cc' | 'Cf' | 'Cn' | 'Co' | 'Cs'
  444    *                     | 'Pd' | 'Ps' | 'Pe' | 'Pc' | 'Po'
  445    *                     | 'Sm' | 'Sc' | 'Sk' | 'So'
  446    * block-name ::= (See above)
  447    * other-properties ::= 'ALL' | 'ASSIGNED' | 'UNASSIGNED'
  448    * character-1 ::= (any character except meta-characters)
  449    *
  450    * char-class ::= '[' ranges ']'
  451    *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
  452    * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
  453    * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
  454    *           | range-char | range-char '-' range-char
  455    * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
  456    * code-point ::= '\x' hex-char hex-char
  457    *                | '\x{' hex-char+ '}'
  458    * <!--               | '\u005c u' hex-char hex-char hex-char hex-char
  459    * -->               | '\v' hex-char hex-char hex-char hex-char hex-char hex-char
  460    * hex-char ::= [0-9a-fA-F]
  461    * character-2 ::= (any character except \[]-,)
  462    * </pre>
  463    *
  464    * <hr width="50%">
  465    * <h3>TODO</h3>
  466    * <ul>
  467    *   <li><a href="http://www.unicode.org/unicode/reports/tr18/">Unicode Regular Expression Guidelines</a>
  468    *     <ul>
  469    *       <li>2.4 Canonical Equivalents
  470    *       <li>Level 3
  471    *     </ul>
  472    *   <li>Parsing performance
  473    * </ul>
  474    *
  475    * <hr width="50%">
  476    *
  477    * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
  478    * @version $Id: RegularExpression.java 111285 2004-12-08 16:54:26Z cezar $
  479    */
  480   public class RegularExpression implements java.io.Serializable {
  481       static final boolean DEBUG = false;
  482   
  483       /**
  484        * Builds the operation flow for a token tree unless one already exists; the closure counter is reset before building.
  485        */
  486       private synchronized void compile(Token tok) {
  487           if (operations == null) {
  488               numberOfClosures = 0;
  489               operations = compile(tok, null, false);
  490           }
  491       }
  492   
  493       /**
  494        * Converts a token to an operation.
  495        */
  496       private Op compile(Token tok, Op next, boolean reverse) {
  497           Op ret;
  498           switch (tok.type) {
  499           case Token.DOT:
  500               ret = Op.createDot();
  501               ret.next = next;
  502               break;
  503   
  504           case Token.CHAR:
  505               ret = Op.createChar(tok.getChar());
  506               ret.next = next;
  507               break;
  508   
  509           case Token.ANCHOR:
  510               ret = Op.createAnchor(tok.getChar());
  511               ret.next = next;
  512               break;
  513   
  514           case Token.RANGE:
  515           case Token.NRANGE:
  516               ret = Op.createRange(tok);
  517               ret.next = next;
  518               break;
  519   
  520           case Token.CONCAT:
  521               ret = next;
  522               if (!reverse) {
  523                   for (int i = tok.size()-1;  i >= 0;  i --) {
  524                       ret = compile(tok.getChild(i), ret, false);
  525                   }
  526               } else {
  527                   for (int i = 0;  i < tok.size();  i ++) {
  528                       ret = compile(tok.getChild(i), ret, true);
  529                   }
  530               }
  531               break;
  532   
  533           case Token.UNION:
  534               Op.UnionOp uni = Op.createUnion(tok.size());
  535               for (int i = 0;  i < tok.size();  i ++) {
  536                   uni.addElement(compile(tok.getChild(i), next, reverse));
  537               }
  538               ret = uni;                          // ret.next is null.
  539               break;
  540   
  541           case Token.CLOSURE:
  542           case Token.NONGREEDYCLOSURE:
  543               Token child = tok.getChild(0);
  544               int min = tok.getMin();
  545               int max = tok.getMax();
  546               if (min >= 0 && min == max) { // {n}
  547                   ret = next;
  548                   for (int i = 0; i < min;  i ++) {
  549                       ret = compile(child, ret, reverse);
  550                   }
  551                   break;
  552               }
  553               if (min > 0 && max > 0)
  554                   max -= min;
  555               if (max > 0) {
  556                   // X{2,6} -> XX(X(X(XX?)?)?)?
  557                   ret = next;
  558                   for (int i = 0;  i < max;  i ++) {
  559                       Op.ChildOp q = Op.createQuestion(tok.type == Token.NONGREEDYCLOSURE);
  560                       q.next = next;
  561                       q.setChild(compile(child, ret, reverse));
  562                       ret = q;
  563                   }
  564               } else {
  565                   Op.ChildOp op;
  566                   if (tok.type == Token.NONGREEDYCLOSURE) {
  567                       op = Op.createNonGreedyClosure();
  568                   } else {                        // Token.CLOSURE
  569                       if (child.getMinLength() == 0)
  570                           op = Op.createClosure(this.numberOfClosures++);
  571                       else
  572                           op = Op.createClosure(-1);
  573                   }
  574                   op.next = next;
  575                   op.setChild(compile(child, op, reverse));
  576                   ret = op;
  577               }
  578               if (min > 0) {
  579                   for (int i = 0;  i < min;  i ++) {
  580                       ret = compile(child, ret, reverse);
  581                   }
  582               }
  583               break;
  584   
  585           case Token.EMPTY:
  586               ret = next;
  587               break;
  588   
  589           case Token.STRING:
  590               ret = Op.createString(tok.getString());
  591               ret.next = next;
  592               break;
  593   
  594           case Token.BACKREFERENCE:
  595               ret = Op.createBackReference(tok.getReferenceNumber());
  596               ret.next = next;
  597               break;
  598   
  599           case Token.PAREN:
  600               if (tok.getParenNumber() == 0) {
  601                   ret = compile(tok.getChild(0), next, reverse);
  602               } else if (reverse) {
  603                   next = Op.createCapture(tok.getParenNumber(), next);
  604                   next = compile(tok.getChild(0), next, reverse);
  605                   ret = Op.createCapture(-tok.getParenNumber(), next);
  606               } else {
  607                   next = Op.createCapture(-tok.getParenNumber(), next);
  608                   next = compile(tok.getChild(0), next, reverse);
  609                   ret = Op.createCapture(tok.getParenNumber(), next);
  610               }
  611               break;
  612   
  613           case Token.LOOKAHEAD:
  614               ret = Op.createLook(Op.LOOKAHEAD, next, compile(tok.getChild(0), null, false));
  615               break;
  616           case Token.NEGATIVELOOKAHEAD:
  617               ret = Op.createLook(Op.NEGATIVELOOKAHEAD, next, compile(tok.getChild(0), null, false));
  618               break;
  619           case Token.LOOKBEHIND:
  620               ret = Op.createLook(Op.LOOKBEHIND, next, compile(tok.getChild(0), null, true));
  621               break;
  622           case Token.NEGATIVELOOKBEHIND:
  623               ret = Op.createLook(Op.NEGATIVELOOKBEHIND, next, compile(tok.getChild(0), null, true));
  624               break;
  625   
  626           case Token.INDEPENDENT:
  627               ret = Op.createIndependent(next, compile(tok.getChild(0), null, reverse));
  628               break;
  629   
  630           case Token.MODIFIERGROUP:
  631               ret = Op.createModifier(next, compile(tok.getChild(0), null, reverse),
  632                                       ((Token.ModifierToken)tok).getOptions(),
  633                                       ((Token.ModifierToken)tok).getOptionsMask());
  634               break;
  635   
  636           case Token.CONDITION:
  637               Token.ConditionToken ctok = (Token.ConditionToken)tok;
  638               int ref = ctok.refNumber;
  639               Op condition = ctok.condition == null ? null : compile(ctok.condition, null, reverse);
  640               Op yes = compile(ctok.yes, next, reverse);
  641               Op no = ctok.no == null ? null : compile(ctok.no, next, reverse);
  642               ret = Op.createCondition(next, ref, condition, yes, no);
  643               break;
  644   
  645           default:
  646               throw new RuntimeException("Unknown token type: "+tok.type);
  647           } // switch (tok.type)
  648           return ret;
  649       }
  650   
  651   
  652   //Public
  653   
  654       /**
  655        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  656        *
  657        * @return true if the target is matched to this regular expression.
  658        */
  659       public boolean matches(char[]  target) {
  660           return this.matches(target, 0,  target .length , (Match)null);
  661       }
  662   
  663       /**
  664        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  665        * in specified range or not.
  666        *
  667        * @param start Start offset of the range.
  668        * @param end  End offset +1 of the range.
  669        * @return true if the target is matched to this regular expression.
  670        */
  671       public boolean matches(char[]  target, int start, int end) {
  672           return this.matches(target, start, end, (Match)null);
  673       }
  674   
  675       /**
  676        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  677        *
  678        * @param match A Match instance for storing matching result.
  679        * @return <code>true</code> if <VAR>target</VAR> contains a match of this regular expression; <code>false</code> otherwise.
  680        */
  681       public boolean matches(char[]  target, Match match) {
  682           return this.matches(target, 0,  target .length , match);
  683       }
  684   
  685   
  686       /**
  687        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  688        * in specified range or not.
  689        *
  690        * @param start Start offset of the range.
  691        * @param end  End offset +1 of the range.
  692        * @param match A Match instance for storing matching result.
  693        * @return <code>true</code> if the specified range of <VAR>target</VAR> contains a match of this regular expression; <code>false</code> otherwise.
  694        */
  695       public boolean matches(char[]  target, int start, int end, Match match) {
  696   
  697           synchronized (this) {
  698               if (this.operations == null)
  699                   this.prepare();
  700               if (this.context == null)
  701                   this.context = new Context();
  702           }
  703           Context con = null;
  704           synchronized (this.context) {
  705               con = this.context.inuse ? new Context() : this.context;
  706               con.reset(target, start, end, this.numberOfClosures);
  707           }
  708           if (match != null) {
  709               match.setNumberOfGroups(this.nofparen);
  710               match.setSource(target);
  711           } else if (this.hasBackReferences) {
  712               match = new Match();
  713               match.setNumberOfGroups(this.nofparen);
  714               // Need not to call setSource() because
  715               // a caller can not access this match instance.
  716           }
  717           con.match = match;
  718   
  719           if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
  720               int matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
  721               //System.err.println("DEBUG: matchEnd="+matchEnd);
  722               if (matchEnd == con.limit) {
  723                   if (con.match != null) {
  724                       con.match.setBeginning(0, con.start);
  725                       con.match.setEnd(0, matchEnd);
  726                   }
  727                   con.inuse = false;
  728                   return true;
  729               }
  730               return false;
  731           }
  732   
  733           /*
  734            * The pattern has only fixed string.
  735            * The engine uses Boyer-Moore.
  736            */
  737           if (this.fixedStringOnly) {
  738               //System.err.println("DEBUG: fixed-only: "+this.fixedString);
  739               int o = this.fixedStringTable.matches(target, con.start, con.limit);
  740               if (o >= 0) {
  741                   if (con.match != null) {
  742                       con.match.setBeginning(0, o);
  743                       con.match.setEnd(0, o+this.fixedString.length());
  744                   }
  745                   con.inuse = false;
  746                   return true;
  747               }
  748               con.inuse = false;
  749               return false;
  750           }
  751   
  752           /*
  753            * The pattern contains a fixed string.
  754            * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
  755            * If not, it return with false.
  756            */
  757           if (this.fixedString != null) {
  758               int o = this.fixedStringTable.matches(target, con.start, con.limit);
  759               if (o < 0) {
  760                   //System.err.println("Non-match in fixed-string search.");
  761                   con.inuse = false;
  762                   return false;
  763               }
  764           }
  765   
  766           int limit = con.limit-this.minlength;
  767           int matchStart;
  768           int matchEnd = -1;
  769   
  770           /*
  771            * Checks whether the expression starts with ".*".
  772            */
  773           if (this.operations != null
  774               && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
  775               if (isSet(this.options, SINGLE_LINE)) {
  776                   matchStart = con.start;
  777                   matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
  778               } else {
  779                   boolean previousIsEOL = true;
  780                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
  781                       int ch =  target [  matchStart ] ;
  782                       if (isEOLChar(ch)) {
  783                           previousIsEOL = true;
  784                       } else {
  785                           if (previousIsEOL) {
  786                               if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  787                                                                          matchStart, 1, this.options)))
  788                                   break;
  789                           }
  790                           previousIsEOL = false;
  791                       }
  792                   }
  793               }
  794           }
  795   
  796           /*
  797            * Optimization against the first character.
  798            */
  799           else if (this.firstChar != null) {
  800               //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
  801               RangeToken range = this.firstChar;
  802               if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
  803                   range = this.firstChar.getCaseInsensitiveToken();
  804                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
  805                       int ch =  target [  matchStart ] ;
  806                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
  807                           ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
  808                           if (!range.match(ch))  continue;
  809                       } else {
  810                           if (!range.match(ch)) {
  811                               char ch1 = Character.toUpperCase((char)ch);
  812                               if (!range.match(ch1))
  813                                   if (!range.match(Character.toLowerCase(ch1)))
  814                                       continue;
  815                           }
  816                       }
  817                       if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  818                                                                  matchStart, 1, this.options)))
  819                           break;
  820                   }
  821               } else {
  822                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
  823                       int ch =  target [  matchStart ] ;
  824                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
  825                           ch = REUtil.composeFromSurrogates(ch,  target [  matchStart+1 ] );
  826                       if (!range.match(ch))  continue;
  827                       if (0 <= (matchEnd = this. matchCharArray (con, this.operations,
  828                                                                  matchStart, 1, this.options)))
  829                           break;
  830                   }
  831               }
  832           }
  833   
  834           /*
  835            * Straightforward matching.
  836            */
  837           else {
  838               for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
  839                   if (0 <= (matchEnd = this. matchCharArray (con, this.operations, matchStart, 1, this.options)))
  840                       break;
  841               }
  842           }
  843   
  844           if (matchEnd >= 0) {
  845               if (con.match != null) {
  846                   con.match.setBeginning(0, matchStart);
  847                   con.match.setEnd(0, matchEnd);
  848               }
  849               con.inuse = false;
  850               return true;
  851           } else {
  852               con.inuse = false;
  853               return false;
  854           }
  855       }
  856   
  857   /**
  858    * @return -1 when not match; offset of the end of matched string when match.
  859    */
  860       private int matchCharArray (Context con, Op op, int offset, int dx, int opts) {
  861   
  862           char[] target = con.charTarget;
  863   
  864   
  865           while (true) {
  866               if (op == null)
  867                   return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
  868               if (offset > con.limit || offset < con.start)
  869                   return -1;
  870               switch (op.type) {
  871               case Op.CHAR:
  872                   if (isSet(opts, IGNORE_CASE)) {
  873                       int ch = op.getData();
  874                       if (dx > 0) {
  875                           if (offset >= con.limit || !matchIgnoreCase(ch,  target [  offset ] ))
  876                               return -1;
  877                           offset ++;
  878                       } else {
  879                           int o1 = offset-1;
  880                           if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target [  o1 ] ))
  881                               return -1;
  882                           offset = o1;
  883                       }
  884                   } else {
  885                       int ch = op.getData();
  886                       if (dx > 0) {
  887                           if (offset >= con.limit || ch !=  target [  offset ] )
  888                               return -1;
  889                           offset ++;
  890                       } else {
  891                           int o1 = offset-1;
  892                           if (o1 >= con.limit || o1 < 0 || ch !=  target [  o1 ] )
  893                               return -1;
  894                           offset = o1;
  895                       }
  896                   }
  897                   op = op.next;
  898                   break;
  899   
  900               case Op.DOT:
  901                   if (dx > 0) {
  902                       if (offset >= con.limit)
  903                           return -1;
  904                       int ch =  target [  offset ] ;
  905                       if (isSet(opts, SINGLE_LINE)) {
  906                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  907                               offset ++;
  908                       } else {
  909                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  910                               ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
  911                           if (isEOLChar(ch))
  912                               return -1;
  913                       }
  914                       offset ++;
  915                   } else {
  916                       int o1 = offset-1;
  917                       if (o1 >= con.limit || o1 < 0)
  918                           return -1;
  919                       int ch =  target [  o1 ] ;
  920                       if (isSet(opts, SINGLE_LINE)) {
  921                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  922                               o1 --;
  923                       } else {
  924                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  925                               ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
  926                           if (!isEOLChar(ch))
  927                               return -1;
  928                       }
  929                       offset = o1;
  930                   }
  931                   op = op.next;
  932                   break;
  933   
  934               case Op.RANGE:
  935               case Op.NRANGE:
  936                   if (dx > 0) {
  937                       if (offset >= con.limit)
  938                           return -1;
  939                       int ch =  target [  offset ] ;
  940                       if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
  941                           ch = REUtil.composeFromSurrogates(ch,  target [  ++offset ] );
  942                       RangeToken tok = op.getToken();
  943                       if (isSet(opts, IGNORE_CASE)) {
  944                           tok = tok.getCaseInsensitiveToken();
  945                           if (!tok.match(ch)) {
  946                               if (ch >= 0x10000)  return -1;
  947                               char uch;
  948                               if (!tok.match(uch = Character.toUpperCase((char)ch))
  949                                   && !tok.match(Character.toLowerCase(uch)))
  950                                   return -1;
  951                           }
  952                       } else {
  953                           if (!tok.match(ch))  return -1;
  954                       }
  955                       offset ++;
  956                   } else {
  957                       int o1 = offset-1;
  958                       if (o1 >= con.limit || o1 < 0)
  959                           return -1;
  960                       int ch =  target [  o1 ] ;
  961                       if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
  962                           ch = REUtil.composeFromSurrogates( target [  --o1 ] , ch);
  963                       RangeToken tok = op.getToken();
  964                       if (isSet(opts, IGNORE_CASE)) {
  965                           tok = tok.getCaseInsensitiveToken();
  966                           if (!tok.match(ch)) {
  967                               if (ch >= 0x10000)  return -1;
  968                               char uch;
  969                               if (!tok.match(uch = Character.toUpperCase((char)ch))
  970                                   && !tok.match(Character.toLowerCase(uch)))
  971                                   return -1;
  972                           }
  973                       } else {
  974                           if (!tok.match(ch))  return -1;
  975                       }
  976                       offset = o1;
  977                   }
  978                   op = op.next;
  979                   break;
  980   
  981               case Op.ANCHOR:
  982                   boolean go = false;
  983                   switch (op.getData()) {
  984                   case '^':
  985                       if (isSet(opts, MULTIPLE_LINES)) {
  986                           if (!(offset == con.start
  987                                 || offset > con.start && isEOLChar( target [  offset-1 ] )))
  988                               return -1;
  989                       } else {
  990                           if (offset != con.start)
  991                               return -1;
  992                       }
  993                       break;
  994   
  995                   case '@':                         // Internal use only.
  996                       // The @ always matches line beginnings.
  997                       if (!(offset == con.start
  998                             || offset > con.start && isEOLChar( target [  offset-1 ] )))
  999                           return -1;
 1000                       break;
 1001   
 1002                   case '$':
 1003                       if (isSet(opts, MULTIPLE_LINES)) {
 1004                           if (!(offset == con.limit
 1005                                 || offset < con.limit && isEOLChar( target [  offset ] )))
 1006                               return -1;
 1007                       } else {
 1008                           if (!(offset == con.limit
 1009                                 || offset+1 == con.limit && isEOLChar( target [  offset ] )
 1010                                 || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
 1011                                 &&  target [  offset+1 ]  == LINE_FEED))
 1012                               return -1;
 1013                       }
 1014                       break;
 1015   
 1016                   case 'A':
 1017                       if (offset != con.start)  return -1;
 1018                       break;
 1019   
 1020                   case 'Z':
 1021                       if (!(offset == con.limit
 1022                             || offset+1 == con.limit && isEOLChar( target [  offset ] )
 1023                             || offset+2 == con.limit &&  target [  offset ]  == CARRIAGE_RETURN
 1024                             &&  target [  offset+1 ]  == LINE_FEED))
 1025                           return -1;
 1026                       break;
 1027   
 1028                   case 'z':
 1029                       if (offset != con.limit)  return -1;
 1030                       break;
 1031   
 1032                   case 'b':
 1033                       if (con.length == 0)  return -1;
 1034                       {
 1035                           int after = getWordType(target, con.start, con.limit, offset, opts);
 1036                           if (after == WT_IGNORE)  return -1;
 1037                           int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
 1038                           if (after == before)  return -1;
 1039                       }
 1040                       break;
 1041   
 1042                   case 'B':
 1043                       if (con.length == 0)
 1044                           go = true;
 1045                       else {
 1046                           int after = getWordType(target, con.start, con.limit, offset, opts);
 1047                           go = after == WT_IGNORE
 1048                                || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
 1049                       }
 1050                       if (!go)  return -1;
 1051                       break;
 1052   
 1053                   case '<':
 1054                       if (con.length == 0 || offset == con.limit)  return -1;
 1055                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
 1056                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
 1057                           return -1;
 1058                       break;
 1059   
 1060                   case '>':
 1061                       if (con.length == 0 || offset == con.start)  return -1;
 1062                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
 1063                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
 1064                           return -1;
 1065                       break;
 1066                   } // switch anchor type
 1067                   op = op.next;
 1068                   break;
 1069   
 1070               case Op.BACKREFERENCE:
 1071                   {
 1072                       int refno = op.getData();
 1073                       if (refno <= 0 || refno >= this.nofparen)
 1074                           throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
 1075                       if (con.match.getBeginning(refno) < 0
 1076                           || con.match.getEnd(refno) < 0)
 1077                           return -1;                // ********
 1078                       int o2 = con.match.getBeginning(refno);
 1079                       int literallen = con.match.getEnd(refno)-o2;
 1080                       if (!isSet(opts, IGNORE_CASE)) {
 1081                           if (dx > 0) {
 1082                               if (!regionMatches(target, offset, con.limit, o2, literallen))
 1083                                   return -1;
 1084                               offset += literallen;
 1085                           } else {
 1086                               if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
 1087                                   return -1;
 1088                               offset -= literallen;
 1089                           }
 1090                       } else {
 1091                           if (dx > 0) {
 1092                               if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
 1093                                   return -1;
 1094                               offset += literallen;
 1095                           } else {
 1096                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 1097                                                            o2, literallen))
 1098                                   return -1;
 1099                               offset -= literallen;
 1100                           }
 1101                       }
 1102                   }
 1103                   op = op.next;
 1104                   break;
 1105               case Op.STRING:
 1106                   {
 1107                       String literal = op.getString();
 1108                       int literallen = literal.length();
 1109                       if (!isSet(opts, IGNORE_CASE)) {
 1110                           if (dx > 0) {
 1111                               if (!regionMatches(target, offset, con.limit, literal, literallen))
 1112                                   return -1;
 1113                               offset += literallen;
 1114                           } else {
 1115                               if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
 1116                                   return -1;
 1117                               offset -= literallen;
 1118                           }
 1119                       } else {
 1120                           if (dx > 0) {
 1121                               if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
 1122                                   return -1;
 1123                               offset += literallen;
 1124                           } else {
 1125                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 1126                                                            literal, literallen))
 1127                                   return -1;
 1128                               offset -= literallen;
 1129                           }
 1130                       }
 1131                   }
 1132                   op = op.next;
 1133                   break;
 1134   
 1135               case Op.CLOSURE:
 1136                   {
 1137                       /*
 1138                        * Saves current position to avoid
 1139                        * zero-width repeats.
 1140                        */
 1141                       int id = op.getData();
 1142                       if (id >= 0) {
 1143                           int previousOffset = con.offsets[id];
 1144                           if (previousOffset < 0 || previousOffset != offset) {
 1145                               con.offsets[id] = offset;
 1146                           } else {
 1147                               con.offsets[id] = -1;
 1148                               op = op.next;
 1149                               break;
 1150                           }
 1151                       }
 1152   
 1153                       int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
 1154                       if (id >= 0)  con.offsets[id] = -1;
 1155                       if (ret >= 0)  return ret;
 1156                       op = op.next;
 1157                   }
 1158                   break;
 1159   
 1160               case Op.QUESTION:
 1161                   {
 1162                       int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
 1163                       if (ret >= 0)  return ret;
 1164                       op = op.next;
 1165                   }
 1166                   break;
 1167   
 1168               case Op.NONGREEDYCLOSURE:
 1169               case Op.NONGREEDYQUESTION:
 1170                   {
 1171                       int ret = this. matchCharArray (con, op.next, offset, dx, opts);
 1172                       if (ret >= 0)  return ret;
 1173                       op = op.getChild();
 1174                   }
 1175                   break;
 1176   
 1177               case Op.UNION:
 1178                   for (int i = 0;  i < op.size();  i ++) {
 1179                       int ret = this. matchCharArray (con, op.elementAt(i), offset, dx, opts);
 1180                       if (DEBUG) {
 1181                           System.err.println("UNION: "+i+", ret="+ret);
 1182                       }
 1183                       if (ret >= 0)  return ret;
 1184                   }
 1185                   return -1;
 1186   
 1187               case Op.CAPTURE:
 1188                   int refno = op.getData();
 1189                   if (con.match != null && refno > 0) {
 1190                       int save = con.match.getBeginning(refno);
 1191                       con.match.setBeginning(refno, offset);
 1192                       int ret = this. matchCharArray (con, op.next, offset, dx, opts);
 1193                       if (ret < 0)  con.match.setBeginning(refno, save);
 1194                       return ret;
 1195                   } else if (con.match != null && refno < 0) {
 1196                       int index = -refno;
 1197                       int save = con.match.getEnd(index);
 1198                       con.match.setEnd(index, offset);
 1199                       int ret = this. matchCharArray (con, op.next, offset, dx, opts);
 1200                       if (ret < 0)  con.match.setEnd(index, save);
 1201                       return ret;
 1202                   }
 1203                   op = op.next;
 1204                   break;
 1205   
 1206               case Op.LOOKAHEAD:
 1207                   if (0 > this. matchCharArray (con, op.getChild(), offset, 1, opts))  return -1;
 1208                   op = op.next;
 1209                   break;
 1210               case Op.NEGATIVELOOKAHEAD:
 1211                   if (0 <= this. matchCharArray (con, op.getChild(), offset, 1, opts))  return -1;
 1212                   op = op.next;
 1213                   break;
 1214               case Op.LOOKBEHIND:
 1215                   if (0 > this. matchCharArray (con, op.getChild(), offset, -1, opts))  return -1;
 1216                   op = op.next;
 1217                   break;
 1218               case Op.NEGATIVELOOKBEHIND:
 1219                   if (0 <= this. matchCharArray (con, op.getChild(), offset, -1, opts))  return -1;
 1220                   op = op.next;
 1221                   break;
 1222   
 1223               case Op.INDEPENDENT:
 1224                   {
 1225                       int ret = this. matchCharArray (con, op.getChild(), offset, dx, opts);
 1226                       if (ret < 0)  return ret;
 1227                       offset = ret;
 1228                       op = op.next;
 1229                   }
 1230                   break;
 1231   
 1232               case Op.MODIFIER:
 1233                   {
 1234                       int localopts = opts;
 1235                       localopts |= op.getData();
 1236                       localopts &= ~op.getData2();
 1237                       //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
 1238                       int ret = this. matchCharArray (con, op.getChild(), offset, dx, localopts);
 1239                       if (ret < 0)  return ret;
 1240                       offset = ret;
 1241                       op = op.next;
 1242                   }
 1243                   break;
 1244   
 1245               case Op.CONDITION:
 1246                   {
 1247                       Op.ConditionOp cop = (Op.ConditionOp)op;
 1248                       boolean matchp = false;
 1249                       if (cop.refNumber > 0) {
 1250                           if (cop.refNumber >= this.nofparen)
 1251                               throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
 1252                           matchp = con.match.getBeginning(cop.refNumber) >= 0
 1253                                    && con.match.getEnd(cop.refNumber) >= 0;
 1254                       } else {
 1255                           matchp = 0 <= this. matchCharArray (con, cop.condition, offset, dx, opts);
 1256                       }
 1257   
 1258                       if (matchp) {
 1259                           op = cop.yes;
 1260                       } else if (cop.no != null) {
 1261                           op = cop.no;
 1262                       } else {
 1263                           op = cop.next;
 1264                       }
 1265                   }
 1266                   break;
 1267   
 1268               default:
 1269                   throw new RuntimeException("Unknown operation type: "+op.type);
 1270               } // switch (op.type)
 1271           } // while
 1272       }
 1273   
 1274       private static final int getPreviousWordType(char[]  target, int begin, int end,
 1275                                                    int offset, int opts) {
 1276           int ret = getWordType(target, begin, end, --offset, opts);
 1277           while (ret == WT_IGNORE)
 1278               ret = getWordType(target, begin, end, --offset, opts);
 1279           return ret;
 1280       }
 1281   
 1282       private static final int getWordType(char[]  target, int begin, int end,
 1283                                            int offset, int opts) {
 1284           if (offset < begin || offset >= end)  return WT_OTHER;
 1285           return getWordType0( target [  offset ] , opts);
 1286       }
 1287   
 1288   
 1289   
 1290       private static final boolean regionMatches(char[]  target, int offset, int limit,
 1291                                                  String part, int partlen) {
 1292           if (offset < 0)  return false;
 1293           if (limit-offset < partlen)
 1294               return false;
 1295           int i = 0;
 1296           while (partlen-- > 0) {
 1297               if ( target [  offset++ ]  != part.charAt(i++))
 1298                   return false;
 1299           }
 1300           return true;
 1301       }
 1302   
 1303       private static final boolean regionMatches(char[]  target, int offset, int limit,
 1304                                                  int offset2, int partlen) {
 1305           if (offset < 0)  return false;
 1306           if (limit-offset < partlen)
 1307               return false;
 1308           int i = offset2;
 1309           while (partlen-- > 0) {
 1310               if ( target [  offset++ ]  !=  target [  i++ ] )
 1311                   return false;
 1312           }
 1313           return true;
 1314       }
 1315   
 1316   /**
 1317    * @see java.lang.String#regionMatches
 1318    */
 1319       private static final boolean regionMatchesIgnoreCase(char[]  target, int offset, int limit,
 1320                                                            String part, int partlen) {
 1321           if (offset < 0)  return false;
 1322           if (limit-offset < partlen)
 1323               return false;
 1324           int i = 0;
 1325           while (partlen-- > 0) {
 1326               char ch1 =  target [  offset++ ] ;
 1327               char ch2 = part.charAt(i++);
 1328               if (ch1 == ch2)
 1329                   continue;
 1330               char uch1 = Character.toUpperCase(ch1);
 1331               char uch2 = Character.toUpperCase(ch2);
 1332               if (uch1 == uch2)
 1333                   continue;
 1334               if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
 1335                   return false;
 1336           }
 1337           return true;
 1338       }
 1339   
 1340       private static final boolean regionMatchesIgnoreCase(char[]  target, int offset, int limit,
 1341                                                            int offset2, int partlen) {
 1342           if (offset < 0)  return false;
 1343           if (limit-offset < partlen)
 1344               return false;
 1345           int i = offset2;
 1346           while (partlen-- > 0) {
 1347               char ch1 =  target [  offset++ ] ;
 1348               char ch2 =  target [  i++ ] ;
 1349               if (ch1 == ch2)
 1350                   continue;
 1351               char uch1 = Character.toUpperCase(ch1);
 1352               char uch2 = Character.toUpperCase(ch2);
 1353               if (uch1 == uch2)
 1354                   continue;
 1355               if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
 1356                   return false;
 1357           }
 1358           return true;
 1359       }
 1360   
 1361   
 1362   
 1363   
 1364       /**
 1365        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 1366        *
 1367        * @return true if the target is matched to this regular expression.
 1368        */
 1369       public boolean matches(String  target) {
 1370           return this.matches(target, 0,  target .length() , (Match)null);
 1371       }
 1372   
 1373       /**
 1374        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 1375        * in specified range or not.
 1376        *
 1377        * @param start Start offset of the range.
 1378        * @param end  End offset +1 of the range.
 1379        * @return true if the target is matched to this regular expression.
 1380        */
 1381       public boolean matches(String  target, int start, int end) {
 1382           return this.matches(target, start, end, (Match)null);
 1383       }
 1384   
 1385       /**
 1386        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 1387        *
 1388        * @param match A Match instance for storing matching result.
 1389        * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 1390        */
 1391       public boolean matches(String  target, Match match) {
 1392           return this.matches(target, 0,  target .length() , match);
 1393       }
 1394   
 1395       /**
 1396        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
 1397        * in specified range or not.
 1398        *
 1399        * @param start Start offset of the range.
 1400        * @param end  End offset +1 of the range.
 1401        * @param match A Match instance for storing matching result.
 1402        * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 1403        */
 1404       public boolean matches(String  target, int start, int end, Match match) {
 1405   
 1406           synchronized (this) {
 1407               if (this.operations == null)
 1408                   this.prepare();
 1409               if (this.context == null)
 1410                   this.context = new Context();
 1411           }
 1412           Context con = null;
 1413           synchronized (this.context) {
 1414               con = this.context.inuse ? new Context() : this.context;
 1415               con.reset(target, start, end, this.numberOfClosures);
 1416           }
 1417           if (match != null) {
 1418               match.setNumberOfGroups(this.nofparen);
 1419               match.setSource(target);
 1420           } else if (this.hasBackReferences) {
 1421               match = new Match();
 1422               match.setNumberOfGroups(this.nofparen);
 1423               // Need not to call setSource() because
 1424               // a caller can not access this match instance.
 1425           }
 1426           con.match = match;
 1427   
 1428           if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 1429               if (DEBUG) {
 1430                   System.err.println("target string="+target);
 1431               }
 1432               int matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
 1433               if (DEBUG) {
 1434                   System.err.println("matchEnd="+matchEnd);
 1435                   System.err.println("con.limit="+con.limit);
 1436               }
 1437               if (matchEnd == con.limit) {
 1438                   if (con.match != null) {
 1439                       con.match.setBeginning(0, con.start);
 1440                       con.match.setEnd(0, matchEnd);
 1441                   }
 1442                   con.inuse = false;
 1443                   return true;
 1444               }
 1445               return false;
 1446           }
 1447   
 1448           /*
 1449            * The pattern has only fixed string.
 1450            * The engine uses Boyer-Moore.
 1451            */
 1452           if (this.fixedStringOnly) {
 1453               //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 1454               int o = this.fixedStringTable.matches(target, con.start, con.limit);
 1455               if (o >= 0) {
 1456                   if (con.match != null) {
 1457                       con.match.setBeginning(0, o);
 1458                       con.match.setEnd(0, o+this.fixedString.length());
 1459                   }
 1460                   con.inuse = false;
 1461                   return true;
 1462               }
 1463               con.inuse = false;
 1464               return false;
 1465           }
 1466   
 1467           /*
 1468            * The pattern contains a fixed string.
 1469            * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 1470            * If not, it return with false.
 1471            */
 1472           if (this.fixedString != null) {
 1473               int o = this.fixedStringTable.matches(target, con.start, con.limit);
 1474               if (o < 0) {
 1475                   //System.err.println("Non-match in fixed-string search.");
 1476                   con.inuse = false;
 1477                   return false;
 1478               }
 1479           }
 1480   
 1481           int limit = con.limit-this.minlength;
 1482           int matchStart;
 1483           int matchEnd = -1;
 1484   
 1485           /*
 1486            * Checks whether the expression starts with ".*".
 1487            */
 1488           if (this.operations != null
 1489               && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 1490               if (isSet(this.options, SINGLE_LINE)) {
 1491                   matchStart = con.start;
 1492                   matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
 1493               } else {
 1494                   boolean previousIsEOL = true;
 1495                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 1496                       int ch =  target .charAt(  matchStart ) ;
 1497                       if (isEOLChar(ch)) {
 1498                           previousIsEOL = true;
 1499                       } else {
 1500                           if (previousIsEOL) {
 1501                               if (0 <= (matchEnd = this. matchString (con, this.operations,
 1502                                                                       matchStart, 1, this.options)))
 1503                                   break;
 1504                           }
 1505                           previousIsEOL = false;
 1506                       }
 1507                   }
 1508               }
 1509           }
 1510   
 1511           /*
 1512            * Optimization against the first character.
 1513            */
 1514           else if (this.firstChar != null) {
 1515               //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
 1516               RangeToken range = this.firstChar;
 1517               if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
 1518                   range = this.firstChar.getCaseInsensitiveToken();
 1519                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 1520                       int ch =  target .charAt(  matchStart ) ;
 1521                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
 1522                           ch = REUtil.composeFromSurrogates(ch,  target .charAt(  matchStart+1 ) );
 1523                           if (!range.match(ch))  continue;
 1524                       } else {
 1525                           if (!range.match(ch)) {
 1526                               char ch1 = Character.toUpperCase((char)ch);
 1527                               if (!range.match(ch1))
 1528                                   if (!range.match(Character.toLowerCase(ch1)))
 1529                                       continue;
 1530                           }
 1531                       }
 1532                       if (0 <= (matchEnd = this. matchString (con, this.operations,
 1533                                                               matchStart, 1, this.options)))
 1534                           break;
 1535                   }
 1536               } else {
 1537                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 1538                       int ch =  target .charAt(  matchStart ) ;
 1539                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
 1540                           ch = REUtil.composeFromSurrogates(ch,  target .charAt(  matchStart+1 ) );
 1541                       if (!range.match(ch))  continue;
 1542                       if (0 <= (matchEnd = this. matchString (con, this.operations,
 1543                                                               matchStart, 1, this.options)))
 1544                           break;
 1545                   }
 1546               }
 1547           }
 1548   
 1549           /*
 1550            * Straightforward matching.
 1551            */
 1552           else {
 1553               for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 1554                   if (0 <= (matchEnd = this. matchString (con, this.operations, matchStart, 1, this.options)))
 1555                       break;
 1556               }
 1557           }
 1558   
 1559           if (matchEnd >= 0) {
 1560               if (con.match != null) {
 1561                   con.match.setBeginning(0, matchStart);
 1562                   con.match.setEnd(0, matchEnd);
 1563               }
 1564               con.inuse = false;
 1565               return true;
 1566           } else {
 1567               con.inuse = false;
 1568               return false;
 1569           }
 1570       }
 1571   
 1572       /**
 1573        * @return -1 when not match; offset of the end of matched string when match.
 1574        */
 1575       private int matchString (Context con, Op op, int offset, int dx, int opts) {
 1576   
 1577   
 1578   
 1579   
 1580           String target = con.strTarget;
 1581   
 1582   
 1583   
 1584   
 1585           while (true) {
 1586               if (op == null)
 1587                   return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
 1588               if (offset > con.limit || offset < con.start)
 1589                   return -1;
 1590               switch (op.type) {
 1591               case Op.CHAR:
 1592                   if (isSet(opts, IGNORE_CASE)) {
 1593                       int ch = op.getData();
 1594                       if (dx > 0) {
 1595                           if (offset >= con.limit || !matchIgnoreCase(ch,  target .charAt(  offset ) ))
 1596                               return -1;
 1597                           offset ++;
 1598                       } else {
 1599                           int o1 = offset-1;
 1600                           if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target .charAt(  o1 ) ))
 1601                               return -1;
 1602                           offset = o1;
 1603                       }
 1604                   } else {
 1605                       int ch = op.getData();
 1606                       if (dx > 0) {
 1607                           if (offset >= con.limit || ch !=  target .charAt(  offset ) )
 1608                               return -1;
 1609                           offset ++;
 1610                       } else {
 1611                           int o1 = offset-1;
 1612                           if (o1 >= con.limit || o1 < 0 || ch !=  target .charAt(  o1 ) )
 1613                               return -1;
 1614                           offset = o1;
 1615                       }
 1616                   }
 1617                   op = op.next;
 1618                   break;
 1619   
 1620               case Op.DOT:
 1621                   if (dx > 0) {
 1622                       if (offset >= con.limit)
 1623                           return -1;
 1624                       int ch =  target .charAt(  offset ) ;
 1625                       if (isSet(opts, SINGLE_LINE)) {
 1626                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 1627                               offset ++;
 1628                       } else {
 1629                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 1630                               ch = REUtil.composeFromSurrogates(ch,  target .charAt(  ++offset ) );
 1631                           if (isEOLChar(ch))
 1632                               return -1;
 1633                       }
 1634                       offset ++;
 1635                   } else {
 1636                       int o1 = offset-1;
 1637                       if (o1 >= con.limit || o1 < 0)
 1638                           return -1;
 1639                       int ch =  target .charAt(  o1 ) ;
 1640                       if (isSet(opts, SINGLE_LINE)) {
 1641                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 1642                               o1 --;
 1643                       } else {
 1644                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 1645                               ch = REUtil.composeFromSurrogates( target .charAt(  --o1 ) , ch);
 1646                           if (!isEOLChar(ch))
 1647                               return -1;
 1648                       }
 1649                       offset = o1;
 1650                   }
 1651                   op = op.next;
 1652                   break;
 1653   
 1654               case Op.RANGE:
 1655               case Op.NRANGE:
 1656                   if (dx > 0) {
 1657                       if (offset >= con.limit)
 1658                           return -1;
 1659                       int ch =  target .charAt(  offset ) ;
 1660                       if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 1661                           ch = REUtil.composeFromSurrogates(ch,  target .charAt(  ++offset ) );
 1662                       RangeToken tok = op.getToken();
 1663                       if (isSet(opts, IGNORE_CASE)) {
 1664                           tok = tok.getCaseInsensitiveToken();
 1665                           if (!tok.match(ch)) {
 1666                               if (ch >= 0x10000)  return -1;
 1667                               char uch;
 1668                               if (!tok.match(uch = Character.toUpperCase((char)ch))
 1669                                   && !tok.match(Character.toLowerCase(uch)))
 1670                                   return -1;
 1671                           }
 1672                       } else {
 1673                           if (!tok.match(ch))  return -1;
 1674                       }
 1675                       offset ++;
 1676                   } else {
 1677                       int o1 = offset-1;
 1678                       if (o1 >= con.limit || o1 < 0)
 1679                           return -1;
 1680                       int ch =  target .charAt(  o1 ) ;
 1681                       if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 1682                           ch = REUtil.composeFromSurrogates( target .charAt(  --o1 ) , ch);
 1683                       RangeToken tok = op.getToken();
 1684                       if (isSet(opts, IGNORE_CASE)) {
 1685                           tok = tok.getCaseInsensitiveToken();
 1686                           if (!tok.match(ch)) {
 1687                               if (ch >= 0x10000)  return -1;
 1688                               char uch;
 1689                               if (!tok.match(uch = Character.toUpperCase((char)ch))
 1690                                   && !tok.match(Character.toLowerCase(uch)))
 1691                                   return -1;
 1692                           }
 1693                       } else {
 1694                           if (!tok.match(ch))  return -1;
 1695                       }
 1696                       offset = o1;
 1697                   }
 1698                   op = op.next;
 1699                   break;
 1700   
 1701               case Op.ANCHOR:
 1702                   boolean go = false;
 1703                   switch (op.getData()) {
 1704                   case '^':
 1705                       if (isSet(opts, MULTIPLE_LINES)) {
 1706                           if (!(offset == con.start
 1707                                 || offset > con.start && isEOLChar( target .charAt(  offset-1 ) )))
 1708                               return -1;
 1709                       } else {
 1710                           if (offset != con.start)
 1711                               return -1;
 1712                       }
 1713                       break;
 1714   
 1715                   case '@':                         // Internal use only.
 1716                       // The @ always matches line beginnings.
 1717                       if (!(offset == con.start
 1718                             || offset > con.start && isEOLChar( target .charAt(  offset-1 ) )))
 1719                           return -1;
 1720                       break;
 1721   
 1722                   case '$':
 1723                       if (isSet(opts, MULTIPLE_LINES)) {
 1724                           if (!(offset == con.limit
 1725                                 || offset < con.limit && isEOLChar( target .charAt(  offset ) )))
 1726                               return -1;
 1727                       } else {
 1728                           if (!(offset == con.limit
 1729                                 || offset+1 == con.limit && isEOLChar( target .charAt(  offset ) )
 1730                                 || offset+2 == con.limit &&  target .charAt(  offset )  == CARRIAGE_RETURN
 1731                                 &&  target .charAt(  offset+1 )  == LINE_FEED))
 1732                               return -1;
 1733                       }
 1734                       break;
 1735   
 1736                   case 'A':
 1737                       if (offset != con.start)  return -1;
 1738                       break;
 1739   
 1740                   case 'Z':
 1741                       if (!(offset == con.limit
 1742                             || offset+1 == con.limit && isEOLChar( target .charAt(  offset ) )
 1743                             || offset+2 == con.limit &&  target .charAt(  offset )  == CARRIAGE_RETURN
 1744                             &&  target .charAt(  offset+1 )  == LINE_FEED))
 1745                           return -1;
 1746                       break;
 1747   
 1748                   case 'z':
 1749                       if (offset != con.limit)  return -1;
 1750                       break;
 1751   
 1752                   case 'b':
 1753                       if (con.length == 0)  return -1;
 1754                       {
 1755                           int after = getWordType(target, con.start, con.limit, offset, opts);
 1756                           if (after == WT_IGNORE)  return -1;
 1757                           int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
 1758                           if (after == before)  return -1;
 1759                       }
 1760                       break;
 1761   
 1762                   case 'B':
 1763                       if (con.length == 0)
 1764                           go = true;
 1765                       else {
 1766                           int after = getWordType(target, con.start, con.limit, offset, opts);
 1767                           go = after == WT_IGNORE
 1768                                || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
 1769                       }
 1770                       if (!go)  return -1;
 1771                       break;
 1772   
 1773                   case '<':
 1774                       if (con.length == 0 || offset == con.limit)  return -1;
 1775                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
 1776                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
 1777                           return -1;
 1778                       break;
 1779   
 1780                   case '>':
 1781                       if (con.length == 0 || offset == con.start)  return -1;
 1782                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
 1783                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
 1784                           return -1;
 1785                       break;
 1786                   } // switch anchor type
 1787                   op = op.next;
 1788                   break;
 1789   
 1790               case Op.BACKREFERENCE:
 1791                   {
 1792                       int refno = op.getData();
 1793                       if (refno <= 0 || refno >= this.nofparen)
 1794                           throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
 1795                       if (con.match.getBeginning(refno) < 0
 1796                           || con.match.getEnd(refno) < 0)
 1797                           return -1;                // ********
 1798                       int o2 = con.match.getBeginning(refno);
 1799                       int literallen = con.match.getEnd(refno)-o2;
 1800                       if (!isSet(opts, IGNORE_CASE)) {
 1801                           if (dx > 0) {
 1802                               if (!regionMatches(target, offset, con.limit, o2, literallen))
 1803                                   return -1;
 1804                               offset += literallen;
 1805                           } else {
 1806                               if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
 1807                                   return -1;
 1808                               offset -= literallen;
 1809                           }
 1810                       } else {
 1811                           if (dx > 0) {
 1812                               if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
 1813                                   return -1;
 1814                               offset += literallen;
 1815                           } else {
 1816                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 1817                                                            o2, literallen))
 1818                                   return -1;
 1819                               offset -= literallen;
 1820                           }
 1821                       }
 1822                   }
 1823                   op = op.next;
 1824                   break;
 1825               case Op.STRING:
 1826                   {
 1827                       String literal = op.getString();
 1828                       int literallen = literal.length();
 1829                       if (!isSet(opts, IGNORE_CASE)) {
 1830                           if (dx > 0) {
 1831                               if (!regionMatches(target, offset, con.limit, literal, literallen))
 1832                                   return -1;
 1833                               offset += literallen;
 1834                           } else {
 1835                               if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
 1836                                   return -1;
 1837                               offset -= literallen;
 1838                           }
 1839                       } else {
 1840                           if (dx > 0) {
 1841                               if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
 1842                                   return -1;
 1843                               offset += literallen;
 1844                           } else {
 1845                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 1846                                                            literal, literallen))
 1847                                   return -1;
 1848                               offset -= literallen;
 1849                           }
 1850                       }
 1851                   }
 1852                   op = op.next;
 1853                   break;
 1854   
 1855               case Op.CLOSURE:
 1856                   {
 1857                       /*
 1858                        * Saves current position to avoid
 1859                        * zero-width repeats.
 1860                        */
 1861                       int id = op.getData();
 1862                       if (id >= 0) {
 1863                           int previousOffset = con.offsets[id];
 1864                           if (previousOffset < 0 || previousOffset != offset) {
 1865                               con.offsets[id] = offset;
 1866                           } else {
 1867                               con.offsets[id] = -1;
 1868                               op = op.next;
 1869                               break;
 1870                           }
 1871                       }
 1872                       int ret = this. matchString (con, op.getChild(), offset, dx, opts);
 1873                       if (id >= 0)  con.offsets[id] = -1;
 1874                       if (ret >= 0)  return ret;
 1875                       op = op.next;
 1876                   }
 1877                   break;
 1878   
 1879               case Op.QUESTION:
 1880                   {
 1881                       int ret = this. matchString (con, op.getChild(), offset, dx, opts);
 1882                       if (ret >= 0)  return ret;
 1883                       op = op.next;
 1884                   }
 1885                   break;
 1886   
 1887               case Op.NONGREEDYCLOSURE:
 1888               case Op.NONGREEDYQUESTION:
 1889                   {
 1890                       int ret = this. matchString (con, op.next, offset, dx, opts);
 1891                       if (ret >= 0)  return ret;
 1892                       op = op.getChild();
 1893                   }
 1894                   break;
 1895   
 1896               case Op.UNION:
 1897                   for (int i = 0;  i < op.size();  i ++) {
 1898                       int ret = this. matchString (con, op.elementAt(i), offset, dx, opts);
 1899                       if (DEBUG) {
 1900                           System.err.println("UNION: "+i+", ret="+ret);
 1901                       }
 1902                       if (ret >= 0)  return ret;
 1903                   }
 1904                   return -1;
 1905   
 1906               case Op.CAPTURE:
 1907                   int refno = op.getData();
 1908                   if (con.match != null && refno > 0) {
 1909                       int save = con.match.getBeginning(refno);
 1910                       con.match.setBeginning(refno, offset);
 1911                       int ret = this. matchString (con, op.next, offset, dx, opts);
 1912                       if (ret < 0)  con.match.setBeginning(refno, save);
 1913                       return ret;
 1914                   } else if (con.match != null && refno < 0) {
 1915                       int index = -refno;
 1916                       int save = con.match.getEnd(index);
 1917                       con.match.setEnd(index, offset);
 1918                       int ret = this. matchString (con, op.next, offset, dx, opts);
 1919                       if (ret < 0)  con.match.setEnd(index, save);
 1920                       return ret;
 1921                   }
 1922                   op = op.next;
 1923                   break;
 1924   
 1925               case Op.LOOKAHEAD:
 1926                   if (0 > this. matchString (con, op.getChild(), offset, 1, opts))  return -1;
 1927                   op = op.next;
 1928                   break;
 1929               case Op.NEGATIVELOOKAHEAD:
 1930                   if (0 <= this. matchString (con, op.getChild(), offset, 1, opts))  return -1;
 1931                   op = op.next;
 1932                   break;
 1933               case Op.LOOKBEHIND:
 1934                   if (0 > this. matchString (con, op.getChild(), offset, -1, opts))  return -1;
 1935                   op = op.next;
 1936                   break;
 1937               case Op.NEGATIVELOOKBEHIND:
 1938                   if (0 <= this. matchString (con, op.getChild(), offset, -1, opts))  return -1;
 1939                   op = op.next;
 1940                   break;
 1941   
 1942               case Op.INDEPENDENT:
 1943                   {
 1944                       int ret = this. matchString (con, op.getChild(), offset, dx, opts);
 1945                       if (ret < 0)  return ret;
 1946                       offset = ret;
 1947                       op = op.next;
 1948                   }
 1949                   break;
 1950   
 1951               case Op.MODIFIER:
 1952                   {
 1953                       int localopts = opts;
 1954                       localopts |= op.getData();
 1955                       localopts &= ~op.getData2();
 1956                       //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
 1957                       int ret = this. matchString (con, op.getChild(), offset, dx, localopts);
 1958                       if (ret < 0)  return ret;
 1959                       offset = ret;
 1960                       op = op.next;
 1961                   }
 1962                   break;
 1963   
 1964               case Op.CONDITION:
 1965                   {
 1966                       Op.ConditionOp cop = (Op.ConditionOp)op;
 1967                       boolean matchp = false;
 1968                       if (cop.refNumber > 0) {
 1969                           if (cop.refNumber >= this.nofparen)
 1970                               throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
 1971                           matchp = con.match.getBeginning(cop.refNumber) >= 0
 1972                                    && con.match.getEnd(cop.refNumber) >= 0;
 1973                       } else {
 1974                           matchp = 0 <= this. matchString (con, cop.condition, offset, dx, opts);
 1975                       }
 1976   
 1977                       if (matchp) {
 1978                           op = cop.yes;
 1979                       } else if (cop.no != null) {
 1980                           op = cop.no;
 1981                       } else {
 1982                           op = cop.next;
 1983                       }
 1984                   }
 1985                   break;
 1986   
 1987               default:
 1988                   throw new RuntimeException("Unknown operation type: "+op.type);
 1989               } // switch (op.type)
 1990           } // while
 1991       }
 1992   
 1993       private static final int getPreviousWordType(String  target, int begin, int end,
 1994                                                    int offset, int opts) {
 1995           int ret = getWordType(target, begin, end, --offset, opts);
 1996           while (ret == WT_IGNORE)
 1997               ret = getWordType(target, begin, end, --offset, opts);
 1998           return ret;
 1999       }
 2000   
 2001       private static final int getWordType(String  target, int begin, int end,
 2002                                            int offset, int opts) {
 2003           if (offset < begin || offset >= end)  return WT_OTHER;
 2004           return getWordType0( target .charAt(  offset ) , opts);
 2005       }
 2006   
 2007   
 2008       private static final boolean regionMatches(String text, int offset, int limit,
 2009                                                  String part, int partlen) {
 2010           if (limit-offset < partlen)  return false;
 2011           return text.regionMatches(offset, part, 0, partlen);
 2012       }
 2013   
 2014       private static final boolean regionMatches(String text, int offset, int limit,
 2015                                                  int offset2, int partlen) {
 2016           if (limit-offset < partlen)  return false;
 2017           return text.regionMatches(offset, text, offset2, partlen);
 2018       }
 2019   
 2020       private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit,
 2021                                                            String part, int partlen) {
 2022           return text.regionMatches(true, offset, part, 0, partlen);
 2023       }
 2024   
 2025       private static final boolean regionMatchesIgnoreCase(String text, int offset, int limit,
 2026                                                            int offset2, int partlen) {
 2027           if (limit-offset < partlen)  return false;
 2028           return text.regionMatches(true, offset, text, offset2, partlen);
 2029       }
 2030   
 2031   
 2032   
 2033   
 2034   
 2035   
 2036   
 2037       /**
 2038        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 2039        *
 2040        * @return true if the target is matched to this regular expression.
 2041        */
 2042       public boolean matches(CharacterIterator target) {
 2043           return this.matches(target, (Match)null);
 2044       }
 2045   
 2046   
 2047       /**
 2048        * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
 2049        *
 2050        * @param match A Match instance for storing matching result.
 2051        * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
 2052        */
 2053       public boolean matches(CharacterIterator  target, Match match) {
 2054           int start = target.getBeginIndex();
 2055           int end = target.getEndIndex();
 2056   
 2057   
 2058   
 2059           synchronized (this) {
 2060               if (this.operations == null)
 2061                   this.prepare();
 2062               if (this.context == null)
 2063                   this.context = new Context();
 2064           }
 2065           Context con = null;
 2066           synchronized (this.context) {
 2067               con = this.context.inuse ? new Context() : this.context;
 2068               con.reset(target, start, end, this.numberOfClosures);
 2069           }
 2070           if (match != null) {
 2071               match.setNumberOfGroups(this.nofparen);
 2072               match.setSource(target);
 2073           } else if (this.hasBackReferences) {
 2074               match = new Match();
 2075               match.setNumberOfGroups(this.nofparen);
 2076               // Need not to call setSource() because
 2077               // a caller can not access this match instance.
 2078           }
 2079           con.match = match;
 2080   
 2081           if (RegularExpression.isSet(this.options, XMLSCHEMA_MODE)) {
 2082               int matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
 2083               //System.err.println("DEBUG: matchEnd="+matchEnd);
 2084               if (matchEnd == con.limit) {
 2085                   if (con.match != null) {
 2086                       con.match.setBeginning(0, con.start);
 2087                       con.match.setEnd(0, matchEnd);
 2088                   }
 2089                   con.inuse = false;
 2090                   return true;
 2091               }
 2092               return false;
 2093           }
 2094   
 2095           /*
 2096            * The pattern has only fixed string.
 2097            * The engine uses Boyer-Moore.
 2098            */
 2099           if (this.fixedStringOnly) {
 2100               //System.err.println("DEBUG: fixed-only: "+this.fixedString);
 2101               int o = this.fixedStringTable.matches(target, con.start, con.limit);
 2102               if (o >= 0) {
 2103                   if (con.match != null) {
 2104                       con.match.setBeginning(0, o);
 2105                       con.match.setEnd(0, o+this.fixedString.length());
 2106                   }
 2107                   con.inuse = false;
 2108                   return true;
 2109               }
 2110               con.inuse = false;
 2111               return false;
 2112           }
 2113   
 2114           /*
 2115            * The pattern contains a fixed string.
 2116            * The engine checks with Boyer-Moore whether the text contains the fixed string or not.
 2117            * If not, it return with false.
 2118            */
 2119           if (this.fixedString != null) {
 2120               int o = this.fixedStringTable.matches(target, con.start, con.limit);
 2121               if (o < 0) {
 2122                   //System.err.println("Non-match in fixed-string search.");
 2123                   con.inuse = false;
 2124                   return false;
 2125               }
 2126           }
 2127   
 2128           int limit = con.limit-this.minlength;
 2129           int matchStart;
 2130           int matchEnd = -1;
 2131   
 2132           /*
 2133            * Checks whether the expression starts with ".*".
 2134            */
 2135           if (this.operations != null
 2136               && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
 2137               if (isSet(this.options, SINGLE_LINE)) {
 2138                   matchStart = con.start;
 2139                   matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
 2140               } else {
 2141                   boolean previousIsEOL = true;
 2142                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 2143                       int ch =  target .setIndex(  matchStart ) ;
 2144                       if (isEOLChar(ch)) {
 2145                           previousIsEOL = true;
 2146                       } else {
 2147                           if (previousIsEOL) {
 2148                               if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
 2149                                                                                  matchStart, 1, this.options)))
 2150                                   break;
 2151                           }
 2152                           previousIsEOL = false;
 2153                       }
 2154                   }
 2155               }
 2156           }
 2157   
 2158           /*
 2159            * Optimization against the first character.
 2160            */
 2161           else if (this.firstChar != null) {
 2162               //System.err.println("DEBUG: with firstchar-matching: "+this.firstChar);
 2163               RangeToken range = this.firstChar;
 2164               if (RegularExpression.isSet(this.options, IGNORE_CASE)) {
 2165                   range = this.firstChar.getCaseInsensitiveToken();
 2166                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 2167                       int ch =  target .setIndex(  matchStart ) ;
 2168                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit) {
 2169                           ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  matchStart+1 ) );
 2170                           if (!range.match(ch))  continue;
 2171                       } else {
 2172                           if (!range.match(ch)) {
 2173                               char ch1 = Character.toUpperCase((char)ch);
 2174                               if (!range.match(ch1))
 2175                                   if (!range.match(Character.toLowerCase(ch1)))
 2176                                       continue;
 2177                           }
 2178                       }
 2179                       if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
 2180                                                                          matchStart, 1, this.options)))
 2181                           break;
 2182                   }
 2183               } else {
 2184                   for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 2185                       int ch =  target .setIndex(  matchStart ) ;
 2186                       if (REUtil.isHighSurrogate(ch) && matchStart+1 < con.limit)
 2187                           ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  matchStart+1 ) );
 2188                       if (!range.match(ch))  continue;
 2189                       if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations,
 2190                                                                          matchStart, 1, this.options)))
 2191                           break;
 2192                   }
 2193               }
 2194           }
 2195   
 2196           /*
 2197            * Straightforward matching.
 2198            */
 2199           else {
 2200               for (matchStart = con.start;  matchStart <= limit;  matchStart ++) {
 2201                   if (0 <= (matchEnd = this. matchCharacterIterator (con, this.operations, matchStart, 1, this.options)))
 2202                       break;
 2203               }
 2204           }
 2205   
 2206           if (matchEnd >= 0) {
 2207               if (con.match != null) {
 2208                   con.match.setBeginning(0, matchStart);
 2209                   con.match.setEnd(0, matchEnd);
 2210               }
 2211               con.inuse = false;
 2212               return true;
 2213           } else {
 2214               con.inuse = false;
 2215               return false;
 2216           }
 2217       }
 2218   
 2219       /**
 2220        * @return -1 when not match; offset of the end of matched string when match.
 2221        */
 2222       private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
 2223   
 2224   
 2225           CharacterIterator target = con.ciTarget;
 2226   
 2227   
 2228   
 2229   
 2230   
 2231   
 2232           while (true) {
 2233               if (op == null)
 2234                   return isSet(opts, XMLSCHEMA_MODE) && offset != con.limit ? -1 : offset;
 2235               if (offset > con.limit || offset < con.start)
 2236                   return -1;
 2237               switch (op.type) {
 2238               case Op.CHAR:
 2239                   if (isSet(opts, IGNORE_CASE)) {
 2240                       int ch = op.getData();
 2241                       if (dx > 0) {
 2242                           if (offset >= con.limit || !matchIgnoreCase(ch,  target .setIndex(  offset ) ))
 2243                               return -1;
 2244                           offset ++;
 2245                       } else {
 2246                           int o1 = offset-1;
 2247                           if (o1 >= con.limit || o1 < 0 || !matchIgnoreCase(ch,  target .setIndex(  o1 ) ))
 2248                               return -1;
 2249                           offset = o1;
 2250                       }
 2251                   } else {
 2252                       int ch = op.getData();
 2253                       if (dx > 0) {
 2254                           if (offset >= con.limit || ch !=  target .setIndex(  offset ) )
 2255                               return -1;
 2256                           offset ++;
 2257                       } else {
 2258                           int o1 = offset-1;
 2259                           if (o1 >= con.limit || o1 < 0 || ch !=  target .setIndex(  o1 ) )
 2260                               return -1;
 2261                           offset = o1;
 2262                       }
 2263                   }
 2264                   op = op.next;
 2265                   break;
 2266   
 2267               case Op.DOT:
 2268                   if (dx > 0) {
 2269                       if (offset >= con.limit)
 2270                           return -1;
 2271                       int ch =  target .setIndex(  offset ) ;
 2272                       if (isSet(opts, SINGLE_LINE)) {
 2273                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 2274                               offset ++;
 2275                       } else {
 2276                           if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 2277                               ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  ++offset ) );
 2278                           if (isEOLChar(ch))
 2279                               return -1;
 2280                       }
 2281                       offset ++;
 2282                   } else {
 2283                       int o1 = offset-1;
 2284                       if (o1 >= con.limit || o1 < 0)
 2285                           return -1;
 2286                       int ch =  target .setIndex(  o1 ) ;
 2287                       if (isSet(opts, SINGLE_LINE)) {
 2288                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 2289                               o1 --;
 2290                       } else {
 2291                           if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 2292                               ch = REUtil.composeFromSurrogates( target .setIndex(  --o1 ) , ch);
 2293                           if (!isEOLChar(ch))
 2294                               return -1;
 2295                       }
 2296                       offset = o1;
 2297                   }
 2298                   op = op.next;
 2299                   break;
 2300   
 2301               case Op.RANGE:
 2302               case Op.NRANGE:
 2303                   if (dx > 0) {
 2304                       if (offset >= con.limit)
 2305                           return -1;
 2306                       int ch =  target .setIndex(  offset ) ;
 2307                       if (REUtil.isHighSurrogate(ch) && offset+1 < con.limit)
 2308                           ch = REUtil.composeFromSurrogates(ch,  target .setIndex(  ++offset ) );
 2309                       RangeToken tok = op.getToken();
 2310                       if (isSet(opts, IGNORE_CASE)) {
 2311                           tok = tok.getCaseInsensitiveToken();
 2312                           if (!tok.match(ch)) {
 2313                               if (ch >= 0x10000)  return -1;
 2314                               char uch;
 2315                               if (!tok.match(uch = Character.toUpperCase((char)ch))
 2316                                   && !tok.match(Character.toLowerCase(uch)))
 2317                                   return -1;
 2318                           }
 2319                       } else {
 2320                           if (!tok.match(ch))  return -1;
 2321                       }
 2322                       offset ++;
 2323                   } else {
 2324                       int o1 = offset-1;
 2325                       if (o1 >= con.limit || o1 < 0)
 2326                           return -1;
 2327                       int ch =  target .setIndex(  o1 ) ;
 2328                       if (REUtil.isLowSurrogate(ch) && o1-1 >= 0)
 2329                           ch = REUtil.composeFromSurrogates( target .setIndex(  --o1 ) , ch);
 2330                       RangeToken tok = op.getToken();
 2331                       if (isSet(opts, IGNORE_CASE)) {
 2332                           tok = tok.getCaseInsensitiveToken();
 2333                           if (!tok.match(ch)) {
 2334                               if (ch >= 0x10000)  return -1;
 2335                               char uch;
 2336                               if (!tok.match(uch = Character.toUpperCase((char)ch))
 2337                                   && !tok.match(Character.toLowerCase(uch)))
 2338                                   return -1;
 2339                           }
 2340                       } else {
 2341                           if (!tok.match(ch))  return -1;
 2342                       }
 2343                       offset = o1;
 2344                   }
 2345                   op = op.next;
 2346                   break;
 2347   
 2348               case Op.ANCHOR:
 2349                   boolean go = false;
 2350                   switch (op.getData()) {
 2351                   case '^':
 2352                       if (isSet(opts, MULTIPLE_LINES)) {
 2353                           if (!(offset == con.start
 2354                                 || offset > con.start && isEOLChar( target .setIndex(  offset-1 ) )))
 2355                               return -1;
 2356                       } else {
 2357                           if (offset != con.start)
 2358                               return -1;
 2359                       }
 2360                       break;
 2361   
 2362                   case '@':                         // Internal use only.
 2363                       // The @ always matches line beginnings.
 2364                       if (!(offset == con.start
 2365                             || offset > con.start && isEOLChar( target .setIndex(  offset-1 ) )))
 2366                           return -1;
 2367                       break;
 2368   
 2369                   case '$':
 2370                       if (isSet(opts, MULTIPLE_LINES)) {
 2371                           if (!(offset == con.limit
 2372                                 || offset < con.limit && isEOLChar( target .setIndex(  offset ) )))
 2373                               return -1;
 2374                       } else {
 2375                           if (!(offset == con.limit
 2376                                 || offset+1 == con.limit && isEOLChar( target .setIndex(  offset ) )
 2377                                 || offset+2 == con.limit &&  target .setIndex(  offset )  == CARRIAGE_RETURN
 2378                                 &&  target .setIndex(  offset+1 )  == LINE_FEED))
 2379                               return -1;
 2380                       }
 2381                       break;
 2382   
 2383                   case 'A':
 2384                       if (offset != con.start)  return -1;
 2385                       break;
 2386   
 2387                   case 'Z':
 2388                       if (!(offset == con.limit
 2389                             || offset+1 == con.limit && isEOLChar( target .setIndex(  offset ) )
 2390                             || offset+2 == con.limit &&  target .setIndex(  offset )  == CARRIAGE_RETURN
 2391                             &&  target .setIndex(  offset+1 )  == LINE_FEED))
 2392                           return -1;
 2393                       break;
 2394   
 2395                   case 'z':
 2396                       if (offset != con.limit)  return -1;
 2397                       break;
 2398   
 2399                   case 'b':
 2400                       if (con.length == 0)  return -1;
 2401                       {
 2402                           int after = getWordType(target, con.start, con.limit, offset, opts);
 2403                           if (after == WT_IGNORE)  return -1;
 2404                           int before = getPreviousWordType(target, con.start, con.limit, offset, opts);
 2405                           if (after == before)  return -1;
 2406                       }
 2407                       break;
 2408   
 2409                   case 'B':
 2410                       if (con.length == 0)
 2411                           go = true;
 2412                       else {
 2413                           int after = getWordType(target, con.start, con.limit, offset, opts);
 2414                           go = after == WT_IGNORE
 2415                                || after == getPreviousWordType(target, con.start, con.limit, offset, opts);
 2416                       }
 2417                       if (!go)  return -1;
 2418                       break;
 2419   
 2420                   case '<':
 2421                       if (con.length == 0 || offset == con.limit)  return -1;
 2422                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_LETTER
 2423                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_OTHER)
 2424                           return -1;
 2425                       break;
 2426   
 2427                   case '>':
 2428                       if (con.length == 0 || offset == con.start)  return -1;
 2429                       if (getWordType(target, con.start, con.limit, offset, opts) != WT_OTHER
 2430                           || getPreviousWordType(target, con.start, con.limit, offset, opts) != WT_LETTER)
 2431                           return -1;
 2432                       break;
 2433                   } // switch anchor type
 2434                   op = op.next;
 2435                   break;
 2436   
 2437               case Op.BACKREFERENCE:
 2438                   {
 2439                       int refno = op.getData();
 2440                       if (refno <= 0 || refno >= this.nofparen)
 2441                           throw new RuntimeException("Internal Error: Reference number must be more than zero: "+refno);
 2442                       if (con.match.getBeginning(refno) < 0
 2443                           || con.match.getEnd(refno) < 0)
 2444                           return -1;                // ********
 2445                       int o2 = con.match.getBeginning(refno);
 2446                       int literallen = con.match.getEnd(refno)-o2;
 2447                       if (!isSet(opts, IGNORE_CASE)) {
 2448                           if (dx > 0) {
 2449                               if (!regionMatches(target, offset, con.limit, o2, literallen))
 2450                                   return -1;
 2451                               offset += literallen;
 2452                           } else {
 2453                               if (!regionMatches(target, offset-literallen, con.limit, o2, literallen))
 2454                                   return -1;
 2455                               offset -= literallen;
 2456                           }
 2457                       } else {
 2458                           if (dx > 0) {
 2459                               if (!regionMatchesIgnoreCase(target, offset, con.limit, o2, literallen))
 2460                                   return -1;
 2461                               offset += literallen;
 2462                           } else {
 2463                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 2464                                                            o2, literallen))
 2465                                   return -1;
 2466                               offset -= literallen;
 2467                           }
 2468                       }
 2469                   }
 2470                   op = op.next;
 2471                   break;
 2472               case Op.STRING:
 2473                   {
 2474                       String literal = op.getString();
 2475                       int literallen = literal.length();
 2476                       if (!isSet(opts, IGNORE_CASE)) {
 2477                           if (dx > 0) {
 2478                               if (!regionMatches(target, offset, con.limit, literal, literallen))
 2479                                   return -1;
 2480                               offset += literallen;
 2481                           } else {
 2482                               if (!regionMatches(target, offset-literallen, con.limit, literal, literallen))
 2483                                   return -1;
 2484                               offset -= literallen;
 2485                           }
 2486                       } else {
 2487                           if (dx > 0) {
 2488                               if (!regionMatchesIgnoreCase(target, offset, con.limit, literal, literallen))
 2489                                   return -1;
 2490                               offset += literallen;
 2491                           } else {
 2492                               if (!regionMatchesIgnoreCase(target, offset-literallen, con.limit,
 2493                                                            literal, literallen))
 2494                                   return -1;
 2495                               offset -= literallen;
 2496                           }
 2497                       }
 2498                   }
 2499                   op = op.next;
 2500                   break;
 2501   
 2502               case Op.CLOSURE:
 2503                   {
 2504                       /*
 2505                        * Saves current position to avoid
 2506                        * zero-width repeats.
 2507                        */
 2508                       int id = op.getData();
 2509                       if (id >= 0) {
 2510                           int previousOffset = con.offsets[id];
 2511                           if (previousOffset < 0 || previousOffset != offset) {
 2512                               con.offsets[id] = offset;
 2513                           } else {
 2514                               con.offsets[id] = -1;
 2515                               op = op.next;
 2516                               break;
 2517                           }
 2518                       }
 2519                       
 2520                       int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
 2521                       if (id >= 0)  con.offsets[id] = -1;
 2522                       if (ret >= 0)  return ret;
 2523                       op = op.next;
 2524                   }
 2525                   break;
 2526   
 2527               case Op.QUESTION:
 2528                   {
 2529                       int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
 2530                       if (ret >= 0)  return ret;
 2531                       op = op.next;
 2532                   }
 2533                   break;
 2534   
 2535               case Op.NONGREEDYCLOSURE:
 2536               case Op.NONGREEDYQUESTION:
 2537                   {
 2538                       int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
 2539                       if (ret >= 0)  return ret;
 2540                       op = op.getChild();
 2541                   }
 2542                   break;
 2543   
 2544               case Op.UNION:
 2545                   for (int i = 0;  i < op.size();  i ++) {
 2546                       int ret = this. matchCharacterIterator (con, op.elementAt(i), offset, dx, opts);
 2547                       if (DEBUG) {
 2548                           System.err.println("UNION: "+i+", ret="+ret);
 2549                       }
 2550                       if (ret >= 0)  return ret;
 2551                   }
 2552                   return -1;
 2553   
 2554               case Op.CAPTURE:
 2555                   int refno = op.getData();
 2556                   if (con.match != null && refno > 0) {
 2557                       int save = con.match.getBeginning(refno);
 2558                       con.match.setBeginning(refno, offset);
 2559                       int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
 2560                       if (ret < 0)  con.match.setBeginning(refno, save);
 2561                       return ret;
 2562                   } else if (con.match != null && refno < 0) {
 2563                       int index = -refno;
 2564                       int save = con.match.getEnd(index);
 2565                       con.match.setEnd(index, offset);
 2566                       int ret = this. matchCharacterIterator (con, op.next, offset, dx, opts);
 2567                       if (ret < 0)  con.match.setEnd(index, save);
 2568                       return ret;
 2569                   }
 2570                   op = op.next;
 2571                   break;
 2572   
 2573               case Op.LOOKAHEAD:
 2574                   if (0 > this. matchCharacterIterator (con, op.getChild(), offset, 1, opts))  return -1;
 2575                   op = op.next;
 2576                   break;
 2577               case Op.NEGATIVELOOKAHEAD:
 2578                   if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, 1, opts))  return -1;
 2579                   op = op.next;
 2580                   break;
 2581               case Op.LOOKBEHIND:
 2582                   if (0 > this. matchCharacterIterator (con, op.getChild(), offset, -1, opts))  return -1;
 2583                   op = op.next;
 2584                   break;
 2585               case Op.NEGATIVELOOKBEHIND:
 2586                   if (0 <= this. matchCharacterIterator (con, op.getChild(), offset, -1, opts))  return -1;
 2587                   op = op.next;
 2588                   break;
 2589   
 2590               case Op.INDEPENDENT:
 2591                   {
 2592                       int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, opts);
 2593                       if (ret < 0)  return ret;
 2594                       offset = ret;
 2595                       op = op.next;
 2596                   }
 2597                   break;
 2598   
 2599               case Op.MODIFIER:
 2600                   {
 2601                       int localopts = opts;
 2602                       localopts |= op.getData();
 2603                       localopts &= ~op.getData2();
 2604                       //System.err.println("MODIFIER: "+Integer.toString(opts, 16)+" -> "+Integer.toString(localopts, 16));
 2605                       int ret = this. matchCharacterIterator (con, op.getChild(), offset, dx, localopts);
 2606                       if (ret < 0)  return ret;
 2607                       offset = ret;
 2608                       op = op.next;
 2609                   }
 2610                   break;
 2611   
 2612               case Op.CONDITION:
 2613                   {
 2614                       Op.ConditionOp cop = (Op.ConditionOp)op;
 2615                       boolean matchp = false;
 2616                       if (cop.refNumber > 0) {
 2617                           if (cop.refNumber >= this.nofparen)
 2618                               throw new RuntimeException("Internal Error: Reference number must be more than zero: "+cop.refNumber);
 2619                           matchp = con.match.getBeginning(cop.refNumber) >= 0
 2620                                    && con.match.getEnd(cop.refNumber) >= 0;
 2621                       } else {
 2622                           matchp = 0 <= this. matchCharacterIterator (con, cop.condition, offset, dx, opts);
 2623                       }
 2624   
 2625                       if (matchp) {
 2626                           op = cop.yes;
 2627                       } else if (cop.no != null) {
 2628                           op = cop.no;
 2629                       } else {
 2630                           op = cop.next;
 2631                       }
 2632                   }
 2633                   break;
 2634   
 2635               default:
 2636                   throw new RuntimeException("Unknown operation type: "+op.type);
 2637               } // switch (op.type)
 2638           } // while
 2639       }
 2640   
 2641       private static final int getPreviousWordType(CharacterIterator  target, int begin, int end,
 2642                                                    int offset, int opts) {
 2643           int ret = getWordType(target, begin, end, --offset, opts);
 2644           while (ret == WT_IGNORE)
 2645               ret = getWordType(target, begin, end, --offset, opts);
 2646           return ret;
 2647       }
 2648   
 2649       private static final int getWordType(CharacterIterator  target, int begin, int end,
 2650                                            int offset, int opts) {
 2651           if (offset < begin || offset >= end)  return WT_OTHER;
 2652           return getWordType0( target .setIndex(  offset ) , opts);
 2653       }
 2654   
 2655   
 2656   
 2657       private static final boolean regionMatches(CharacterIterator  target, int offset, int limit,
 2658                                                  String part, int partlen) {
 2659           if (offset < 0)  return false;
 2660           if (limit-offset < partlen)
 2661               return false;
 2662           int i = 0;
 2663           while (partlen-- > 0) {
 2664               if ( target .setIndex(  offset++ )  != part.charAt(i++))
 2665                   return false;
 2666           }
 2667           return true;
 2668       }
 2669   
 2670       private static final boolean regionMatches(CharacterIterator  target, int offset, int limit,
 2671                                                  int offset2, int partlen) {
 2672           if (offset < 0)  return false;
 2673           if (limit-offset < partlen)
 2674               return false;
 2675           int i = offset2;
 2676           while (partlen-- > 0) {
 2677               if ( target .setIndex(  offset++ )  !=  target .setIndex(  i++ ) )
 2678                   return false;
 2679           }
 2680           return true;
 2681       }
 2682   
 2683       /**
 2684        * @see java.lang.String#regionMatches
 2685        */
 2686       private static final boolean regionMatchesIgnoreCase(CharacterIterator  target, int offset, int limit,
 2687                                                            String part, int partlen) {
 2688           if (offset < 0)  return false;
 2689           if (limit-offset < partlen)
 2690               return false;
 2691           int i = 0;
 2692           while (partlen-- > 0) {
 2693               char ch1 =  target .setIndex(  offset++ ) ;
 2694               char ch2 = part.charAt(i++);
 2695               if (ch1 == ch2)
 2696                   continue;
 2697               char uch1 = Character.toUpperCase(ch1);
 2698               char uch2 = Character.toUpperCase(ch2);
 2699               if (uch1 == uch2)
 2700                   continue;
 2701               if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
 2702                   return false;
 2703           }
 2704           return true;
 2705       }
 2706   
 2707       private static final boolean regionMatchesIgnoreCase(CharacterIterator  target, int offset, int limit,
 2708                                                            int offset2, int partlen) {
 2709           if (offset < 0)  return false;
 2710           if (limit-offset < partlen)
 2711               return false;
 2712           int i = offset2;
 2713           while (partlen-- > 0) {
 2714               char ch1 =  target .setIndex(  offset++ ) ;
 2715               char ch2 =  target .setIndex(  i++ ) ;
 2716               if (ch1 == ch2)
 2717                   continue;
 2718               char uch1 = Character.toUpperCase(ch1);
 2719               char uch2 = Character.toUpperCase(ch2);
 2720               if (uch1 == uch2)
 2721                   continue;
 2722               if (Character.toLowerCase(uch1) != Character.toLowerCase(uch2))
 2723                   return false;
 2724           }
 2725           return true;
 2726       }
 2727   
 2728   
 2729   
 2730   
 2731       // ================================================================
 2732   
 2733       /**
 2734        * A regular expression.
             * This is the pattern text last stored by a constructor or by
             * <code>setPattern()</code>; <code>getPattern()</code> returns it.
 2735        * @serial
 2736        */
 2737       String regex;
 2738       /**
             * Bitwise OR of the option flags declared in this class
             * (<code>IGNORE_CASE</code>, <code>SINGLE_LINE</code>, ...).
 2739        * @serial
 2740        */
 2741       int options;
 2742   
 2743       /**
 2744        * The number of parenthesis in the regular expression.
             * Taken from the parser's <code>parennumber</code> in <code>setPattern()</code>
             * and returned by <code>getNumberOfGroups()</code>.
 2745        * @serial
 2746        */
 2747       int nofparen;
 2748       /**
 2749        * Internal representation of the regular expression.
             * Produced by the parser in <code>setPattern()</code> and handed to
             * <code>compile()</code> by <code>prepare()</code>.
 2750        * @serial
 2751        */
 2752       Token tokentree;
 2753   
            /** Copy of the parser's <code>hasBackReferences</code> value for the current pattern. */
 2754       boolean hasBackReferences = false;
 2755   
            // ---- Derived data below is transient; prepare() recomputes minlength,
            // ---- firstChar and the fixed-string fields.

            /** Result of <code>tokentree.getMinLength()</code>, stored by <code>prepare()</code>. */
 2756       transient int minlength;
            /**
             * Head of the compiled operation list; <code>setPattern()</code> resets it to null.
             * NOTE(review): presumably assigned by <code>compile()</code> - confirm.
             */
 2757       transient Op operations = null;
            /** NOTE(review): presumably the closure count handed to <code>Context.reset()</code> - confirm. */
 2758       transient int numberOfClosures;
            /** Matching context; <code>setPattern()</code> resets it to null. */
 2759       transient Context context = null;
            /**
             * Range of possible first characters, set by <code>prepare()</code> when the
             * first-character analysis yields <code>Token.FC_TERMINAL</code>; null otherwise.
             */
 2760       transient RangeToken firstChar = null;
 2761   
            /** Fixed string found by <code>prepare()</code> for the current pattern, or null. */
 2762       transient String fixedString = null;
            /** Options that apply to <code>fixedString</code> (its <code>IGNORE_CASE</code> bit is passed to <code>BMPattern</code>). */
 2763       transient int fixedStringOptions;
            /** <code>BMPattern</code> built by <code>prepare()</code> from <code>fixedString</code>. */
 2764       transient BMPattern fixedStringTable = null;
            /** Set to true by <code>prepare()</code> when the compiled pattern is a single STRING or CHAR operation. */
 2765       transient boolean fixedStringOnly = false;
 2766   
 2767   
 2768       static final class Context {
 2769           CharacterIterator ciTarget;
 2770           String strTarget;
 2771           char[] charTarget;
 2772           int start;
 2773           int limit;
 2774           int length;
 2775           Match match;
 2776           boolean inuse = false;
 2777           int[] offsets;
 2778   
 2779           Context() {
 2780           }
 2781   
 2782           private void resetCommon(int nofclosures) {
 2783               this.length = this.limit-this.start;
 2784               this.inuse = true;
 2785               this.match = null;
 2786               if (this.offsets == null || this.offsets.length != nofclosures)
 2787                   this.offsets = new int[nofclosures];
 2788               for (int i = 0;  i < nofclosures;  i ++)  this.offsets[i] = -1;
 2789           }
 2790           void reset(CharacterIterator target, int start, int limit, int nofclosures) {
 2791               this.ciTarget = target;
 2792               this.start = start;
 2793               this.limit = limit;
 2794               this.resetCommon(nofclosures);
 2795           }
 2796           void reset(String target, int start, int limit, int nofclosures) {
 2797               this.strTarget = target;
 2798               this.start = start;
 2799               this.limit = limit;
 2800               this.resetCommon(nofclosures);
 2801           }
 2802           void reset(char[] target, int start, int limit, int nofclosures) {
 2803               this.charTarget = target;
 2804               this.start = start;
 2805               this.limit = limit;
 2806               this.resetCommon(nofclosures);
 2807           }
 2808       }
 2809   
 2810       /**
 2811        * Prepares for matching.  This method is called just before starting matching.
 2812        */
 2813       void prepare() {
 2814           if (Op.COUNT)  Op.nofinstances = 0;
 2815           this.compile(this.tokentree);
 2816           /*
 2817           if  (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) { // .*
 2818               Op anchor = Op.createAnchor(isSet(this.options, SINGLE_LINE) ? 'A' : '@');
 2819               anchor.next = this.operations;
 2820               this.operations = anchor;
 2821           }
 2822           */
 2823           if (Op.COUNT)  System.err.println("DEBUG: The number of operations: "+Op.nofinstances);
 2824   
 2825           this.minlength = this.tokentree.getMinLength();
 2826   
 2827           this.firstChar = null;
 2828           if (!isSet(this.options, PROHIBIT_HEAD_CHARACTER_OPTIMIZATION)
 2829               && !isSet(this.options, XMLSCHEMA_MODE)) {
 2830               RangeToken firstChar = Token.createRange();
 2831               int fresult = this.tokentree.analyzeFirstCharacter(firstChar, this.options);
 2832               if (fresult == Token.FC_TERMINAL) {
 2833                   firstChar.compactRanges();
 2834                   this.firstChar = firstChar;
 2835                   if (DEBUG)
 2836                       System.err.println("DEBUG: Use the first character optimization: "+firstChar);
 2837               }
 2838           }
 2839   
 2840           if (this.operations != null
 2841               && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
 2842               && this.operations.next == null) {
 2843               if (DEBUG)
 2844                   System.err.print(" *** Only fixed string! *** ");
 2845               this.fixedStringOnly = true;
 2846               if (this.operations.type == Op.STRING)
 2847                   this.fixedString = this.operations.getString();
 2848               else if (this.operations.getData() >= 0x10000) { // Op.CHAR
 2849                   this.fixedString = REUtil.decomposeToSurrogates(this.operations.getData());
 2850               } else {
 2851                   char[] ac = new char[1];
 2852                   ac[0] = (char)this.operations.getData();
 2853                   this.fixedString = new String(ac);
 2854               }
 2855               this.fixedStringOptions = this.options;
 2856               this.fixedStringTable = new BMPattern(this.fixedString, 256,
 2857                                                     isSet(this.fixedStringOptions, IGNORE_CASE));
 2858           } else if (!isSet(this.options, PROHIBIT_FIXED_STRING_OPTIMIZATION)
 2859                      && !isSet(this.options, XMLSCHEMA_MODE)) {
 2860               Token.FixedStringContainer container = new Token.FixedStringContainer();
 2861               this.tokentree.findFixedString(container, this.options);
 2862               this.fixedString = container.token == null ? null : container.token.getString();
 2863               this.fixedStringOptions = container.options;
 2864               if (this.fixedString != null && this.fixedString.length() < 2)
 2865                   this.fixedString = null;
 2866               // This pattern has a fixed string of which length is more than one.
 2867               if (this.fixedString != null) {
 2868                   this.fixedStringTable = new BMPattern(this.fixedString, 256,
 2869                                                         isSet(this.fixedStringOptions, IGNORE_CASE));
 2870                   if (DEBUG) {
 2871                       System.err.println("DEBUG: The longest fixed string: "+this.fixedString.length()
 2872                                          +"/" //+this.fixedString
 2873                                          +"/"+REUtil.createOptionString(this.fixedStringOptions));
 2874                       System.err.print("String: ");
 2875                       REUtil.dumpString(this.fixedString);
 2876                   }
 2877               }
 2878           }
 2879       }
 2880   
 2881       /**
 2882        * An option.
 2883        * If you specify this option, <span class="REGEX"><kbd>(</kbd><var>X</var><kbd>)</kbd></span>
 2884        * captures matched text, and <span class="REGEX"><kbd>(?:</kbd><var>X</var><kbd>)</kbd></span>
 2885        * does not capture.
             *
             * NOTE: the declaration at the end of this comment is commented out, so
             * <code>MARK_PARENS</code> (bit 0) is not defined by this class.
 2886        *
 2887        * @see #RegularExpression(java.lang.String,int)
 2888        * @see #setPattern(java.lang.String,int)
 2889       static final int MARK_PARENS = 1<<0;
 2890        */
 2891   
 2892       /**
 2893        * Option flag for the "i" option letter: case-insensitive matching.
 2894        */
 2895       static final int IGNORE_CASE = 1<<1;
 2896   
 2897       /**
 2898        * Option flag for the "s" option letter (single-line mode).
 2899        */
 2900       static final int SINGLE_LINE = 1<<2;
 2901   
 2902       /**
 2903        * Option flag for the "m" option letter (multiple-lines mode).
 2904        */
 2905       static final int MULTIPLE_LINES = 1<<3;
 2906   
 2907       /**
 2908        * Option flag for the "x" option letter (extended comment mode).
 2909        */
 2910       static final int EXTENDED_COMMENT = 1<<4;
 2911   
 2912       /**
 2913        * This option redefines <span class="REGEX"><kbd>\d \D \w \W \s \S</kbd></span>.
             * When <code>UNICODE_WORD_BOUNDARY</code> is not set, <code>getWordType0()</code>
             * also uses it to choose the "IsWord" range over the legacy word characters.
 2914        *
 2915        * @see #RegularExpression(java.lang.String,int)
 2916        * @see #setPattern(java.lang.String,int)
 2917        * @see #UNICODE_WORD_BOUNDARY
 2918        */
 2919       static final int USE_UNICODE_CATEGORY = 1<<5; // "u"
 2920   
 2921       /**
 2922        * An option.
 2923        * Enables locale-independent word-boundary processing for <span class="REGEX"><kbd>\b \B \&lt; \></kbd></span>.
 2924        * <p>By default, the engine considers a position between a word character
 2925        * (<span class="REGEX"><kbd>\w</kbd></span>) and a non word character
 2926        * is a word boundary.
 2927        * <p>By this option, the engine checks word boundaries with the method of
 2928        * 'Unicode Regular Expression Guidelines' Revision 4.
 2929        *
 2930        * @see #RegularExpression(java.lang.String,int)
 2931        * @see #setPattern(java.lang.String,int)
 2932        */
 2933       static final int UNICODE_WORD_BOUNDARY = 1<<6; // "w"
 2934   
 2935       /**
 2936        * Option flag for the "H" option letter: when set, <code>prepare()</code>
             * skips the first-character analysis and leaves <code>firstChar</code> null.
 2937        */
 2938       static final int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION = 1<<7;
 2939       /**
 2940        * Option flag for the "F" option letter: when set, <code>prepare()</code>
             * does not search the token tree for a fixed string.
 2941        */
 2942       static final int PROHIBIT_FIXED_STRING_OPTIMIZATION = 1<<8;
 2943       /**
 2944        * "X". XML Schema mode.
             * <code>setPattern()</code> parses with <code>ParserForXMLSchema</code> when this
             * flag is set, and <code>prepare()</code> then skips the first-character
             * analysis and the fixed-string search.
 2945        */
 2946       static final int XMLSCHEMA_MODE = 1<<9;
 2947       /**
 2948        * Option flag for the "," option letter.
 2949        */
 2950       static final int SPECIAL_COMMA = 1<<10;
 2951   
 2952   
 2953       private static final boolean isSet(int options, int flag) {
 2954           return (options & flag) == flag;
 2955       }
 2956   
 2957       /**
 2958        * Creates a new RegularExpression instance.
 2959        *
 2960        * @param regex A regular expression
 2961        * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
 2962        */
 2963       public RegularExpression(String regex) throws ParseException {
 2964           this.setPattern(regex, null);
 2965       }
 2966   
 2967       /**
 2968        * Creates a new RegularExpression instance with options.
 2969        *
 2970        * @param regex A regular expression
 2971        * @param options A String consisted of "i" "m" "s" "u" "w" "," "X"
 2972        * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
 2973        */
 2974       public RegularExpression(String regex, String options) throws ParseException {
 2975           this.setPattern(regex, options);
 2976       }
 2977   
 2978       RegularExpression(String regex, Token tok, int parens, boolean hasBackReferences, int options) {
 2979           this.regex = regex;
 2980           this.tokentree = tok;
 2981           this.nofparen = parens;
 2982           this.options = options;
 2983           this.hasBackReferences = hasBackReferences;
 2984       }
 2985   
 2986       /**
 2987        *
 2988        */
 2989       public void setPattern(String newPattern) throws ParseException {
 2990           this.setPattern(newPattern, this.options);
 2991       }
 2992   
 2993       private void setPattern(String newPattern, int options) throws ParseException {
 2994           this.regex = newPattern;
 2995           this.options = options;
 2996           RegexParser rp = RegularExpression.isSet(this.options, RegularExpression.XMLSCHEMA_MODE)
 2997                            ? new ParserForXMLSchema() : new RegexParser();
 2998           this.tokentree = rp.parse(this.regex, this.options);
 2999           this.nofparen = rp.parennumber;
 3000           this.hasBackReferences = rp.hasBackReferences;
 3001   
 3002           this.operations = null;
 3003           this.context = null;
 3004       }
 3005       /**
 3006        *
 3007        */
 3008       public void setPattern(String newPattern, String options) throws ParseException {
 3009           this.setPattern(newPattern, REUtil.parseOptions(options));
 3010       }
 3011   
 3012       /**
 3013        *
 3014        */
 3015       public String getPattern() {
 3016           return this.regex;
 3017       }
 3018   
 3019       /**
 3020        * Represents this instence in String.
 3021        */
 3022       public String toString() {
 3023           return this.tokentree.toString(this.options);
 3024       }
 3025   
 3026       /**
 3027        * Returns a option string.
 3028        * The order of letters in it may be different from a string specified
 3029        * in a constructor or <code>setPattern()</code>.
 3030        *
 3031        * @see #RegularExpression(java.lang.String,java.lang.String)
 3032        * @see #setPattern(java.lang.String,java.lang.String)
 3033        */
 3034       public String getOptions() {
 3035           return REUtil.createOptionString(this.options);
 3036       }
 3037   
 3038       /**
 3039        *  Return true if patterns are the same and the options are equivalent.
 3040        */
 3041       public boolean equals(Object obj) {
 3042           if (obj == null)  return false;
 3043           if (!(obj instanceof RegularExpression))
 3044               return false;
 3045           RegularExpression r = (RegularExpression)obj;
 3046           return this.regex.equals(r.regex) && this.options == r.options;
 3047       }
 3048   
 3049       boolean equals(String pattern, int options) {
 3050           return this.regex.equals(pattern) && this.options == options;
 3051       }
 3052   
 3053       /**
 3054        *
 3055        */
 3056       public int hashCode() {
 3057           return (this.regex+"/"+this.getOptions()).hashCode();
 3058       }
 3059   
 3060       /**
 3061        * Return the number of regular expression groups.
 3062        * This method returns 1 when the regular expression has no capturing-parenthesis.
 3063        *
 3064        */
 3065       public int getNumberOfGroups() {
 3066           return this.nofparen;
 3067       }
 3068   
 3069       // ================================================================
 3070   
            // Word-type codes returned by getWordType0() and getWordType().
            // WT_IGNORE: the character is skipped; getPreviousWordType() keeps
            // scanning backwards while it sees this value.
 3071       private static final int WT_IGNORE = 0;
            // WT_LETTER: the character counts as a word character.
 3072       private static final int WT_LETTER = 1;
            // WT_OTHER: any other character; getWordType() also returns it for
            // an offset outside the begin..end range.
 3073       private static final int WT_OTHER = 2;
 3074       private static final int getWordType0(char ch, int opts) {
 3075           if (!isSet(opts, UNICODE_WORD_BOUNDARY)) {
 3076               if (isSet(opts, USE_UNICODE_CATEGORY)) {
 3077                   return (Token.getRange("IsWord", true).match(ch)) ? WT_LETTER : WT_OTHER;
 3078               }
 3079               return isWordChar(ch) ? WT_LETTER : WT_OTHER;
 3080           }
 3081   
 3082           switch (Character.getType(ch)) {
 3083           case Character.UPPERCASE_LETTER:      // L
 3084           case Character.LOWERCASE_LETTER:      // L
 3085           case Character.TITLECASE_LETTER:      // L
 3086           case Character.MODIFIER_LETTER:       // L
 3087           case Character.OTHER_LETTER:          // L
 3088           case Character.LETTER_NUMBER:         // N
 3089           case Character.DECIMAL_DIGIT_NUMBER:  // N
 3090           case Character.OTHER_NUMBER:          // N
 3091           case Character.COMBINING_SPACING_MARK: // Mc
 3092               return WT_LETTER;
 3093   
 3094           case Character.FORMAT:                // Cf
 3095           case Character.NON_SPACING_MARK:      // Mn
 3096           case Character.ENCLOSING_MARK:        // Mc
 3097               return WT_IGNORE;
 3098   
 3099           case Character.CONTROL:               // Cc
 3100               switch (ch) {
 3101               case '\t':
 3102               case '\n':
 3103               case '\u000B':
 3104               case '\f':
 3105               case '\r':
 3106                   return WT_OTHER;
 3107               default:
 3108                   return WT_IGNORE;
 3109               }
 3110   
 3111           default:
 3112               return WT_OTHER;
 3113           }
 3114       }
 3115   
 3116       // ================================================================
 3117   
            // Code points that isEOLChar() accepts as end-of-line characters.
            // U+000A
 3118       static final int LINE_FEED = 0x000A;
            // U+000D
 3119       static final int CARRIAGE_RETURN = 0x000D;
            // U+2028
 3120       static final int LINE_SEPARATOR = 0x2028;
            // U+2029
 3121       static final int PARAGRAPH_SEPARATOR = 0x2029;
 3122   
 3123       private static final boolean isEOLChar(int ch) {
 3124           return ch == LINE_FEED || ch == CARRIAGE_RETURN || ch == LINE_SEPARATOR
 3125           || ch == PARAGRAPH_SEPARATOR;
 3126       }
 3127   
 3128       private static final boolean isWordChar(int ch) { // Legacy word characters
 3129           if (ch == '_')  return true;
 3130           if (ch < '0')  return false;
 3131           if (ch > 'z')  return false;
 3132           if (ch <= '9')  return true;
 3133           if (ch < 'A')  return false;
 3134           if (ch <= 'Z')  return true;
 3135           if (ch < 'a')  return false;
 3136           return true;
 3137       }
 3138   
 3139       private static final boolean matchIgnoreCase(int chardata, int ch) {
 3140           if (chardata == ch)  return true;
 3141           if (chardata > 0xffff || ch > 0xffff)  return false;
 3142           char uch1 = Character.toUpperCase((char)chardata);
 3143           char uch2 = Character.toUpperCase((char)ch);
 3144           if (uch1 == uch2)  return true;
 3145           return Character.toLowerCase(uch1) == Character.toLowerCase(uch2);
 3146       }
 3147   }

Home » xmlbeans-2.5.0-src » org.apache.xmlbeans.impl » regex » [javadoc | source]