Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » ru » [javadoc | source]
    1   package org.apache.lucene.analysis.ru;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   /**
   21    * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
   22    */
   23   class RussianStemmer
   24   {
   25       // positions of RV, R1 and R2 respectively
   26       private int RV, R1, R2;
   27   
   28       // letters (currently unused letters are commented out)
   29       private final static char A = '\u0430';
   30       //private final static char B = '\u0431';
   31       private final static char V = '\u0432';
   32       private final static char G = '\u0433';
   33       //private final static char D = '\u0434';
   34       private final static char E = '\u0435';
   35       //private final static char ZH = '\u0436';
   36       //private final static char Z = '\u0437';
   37       private final static char I = '\u0438';
   38       private final static char I_ = '\u0439';
   39       //private final static char K = '\u043A';
   40       private final static char L = '\u043B';
   41       private final static char M = '\u043C';
   42       private final static char N = '\u043D';
   43       private final static char O = '\u043E';
   44       //private final static char P = '\u043F';
   45       //private final static char R = '\u0440';
   46       private final static char S = '\u0441';
   47       private final static char T = '\u0442';
   48       private final static char U = '\u0443';
   49       //private final static char F = '\u0444';
   50       private final static char X = '\u0445';
   51       //private final static char TS = '\u0446';
   52       //private final static char CH = '\u0447';
   53       private final static char SH = '\u0448';
   54       private final static char SHCH = '\u0449';
   55       //private final static char HARD = '\u044A';
   56       private final static char Y = '\u044B';
   57       private final static char SOFT = '\u044C';
   58       private final static char AE = '\u044D';
   59       private final static char IU = '\u044E';
   60       private final static char IA = '\u044F';
   61   
   62       // stem definitions
   63       private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
   64   
   65       private static char[][] perfectiveGerundEndings1 = {
   66           { V },
   67           { V, SH, I },
   68           { V, SH, I, S, SOFT }
   69       };
   70   
   71       private static char[][] perfectiveGerund1Predessors = {
   72           { A },
   73           { IA }
   74       };
   75   
   76       private static char[][] perfectiveGerundEndings2 = { { I, V }, {
   77           Y, V }, {
   78               I, V, SH, I }, {
   79                   Y, V, SH, I }, {
   80                       I, V, SH, I, S, SOFT }, {
   81                           Y, V, SH, I, S, SOFT }
   82       };
   83   
   84       private static char[][] adjectiveEndings = {
   85           { E, E },
   86           { I, E },
   87           { Y, E },
   88           { O, E },
   89           { E, I_ },
   90           { I, I_ },
   91           { Y, I_ },
   92           { O, I_ },
   93           { E, M },
   94           { I, M },
   95           { Y, M },
   96           { O, M },
   97           { I, X },
   98           { Y, X },
   99           { U, IU },
  100           { IU, IU },
  101           { A, IA },
  102           { IA, IA },
  103           { O, IU },
  104           { E, IU },
  105           { I, M, I },
  106           { Y, M, I },
  107           { E, G, O },
  108           { O, G, O },
  109           { E, M, U },
  110           {O, M, U }
  111       };
  112   
  113       private static char[][] participleEndings1 = {
  114           { SHCH },
  115           { E, M },
  116           { N, N },
  117           { V, SH },
  118           { IU, SHCH }
  119       };
  120   
  121       private static char[][] participleEndings2 = {
  122           { I, V, SH },
  123           { Y, V, SH },
  124           { U, IU, SHCH }
  125       };
  126   
  127       private static char[][] participle1Predessors = {
  128           { A },
  129           { IA }
  130       };
  131   
  132       private static char[][] reflexiveEndings = {
  133           { S, IA },
  134           { S, SOFT }
  135       };
  136   
  137       private static char[][] verbEndings1 = {
  138           { I_ },
  139           { L },
  140           { N },
  141           { L, O },
  142           { N, O },
  143           { E, T },
  144           { IU, T },
  145           { L, A },
  146           { N, A },
  147           { L, I },
  148           { E, M },
  149           { N, Y },
  150           { E, T, E },
  151           { I_, T, E },
  152           { T, SOFT },
  153           { E, SH, SOFT },
  154           { N, N, O }
  155       };
  156   
  157       private static char[][] verbEndings2 = {
  158           { IU },
  159           { U, IU },
  160           { E, N },
  161           { E, I_ },
  162           { IA, T },
  163           { U, I_ },
  164           { I, L },
  165           { Y, L },
  166           { I, M },
  167           { Y, M },
  168           { I, T },
  169           { Y, T },
  170           { I, L, A },
  171           { Y, L, A },
  172           { E, N, A },
  173           { I, T, E },
  174           { I, L, I },
  175           { Y, L, I },
  176           { I, L, O },
  177           { Y, L, O },
  178           { E, N, O },
  179           { U, E, T },
  180           { U, IU, T },
  181           { E, N, Y },
  182           { I, T, SOFT },
  183           { Y, T, SOFT },
  184           { I, SH, SOFT },
  185           { E, I_, T, E },
  186           { U, I_, T, E }
  187       };
  188   
  189       private static char[][] verb1Predessors = {
  190           { A },
  191           { IA }
  192       };
  193   
  194       private static char[][] nounEndings = {
  195           { A },
  196           { U },
  197           { I_ },
  198           { O },
  199           { U },
  200           { E },
  201           { Y },
  202           { I },
  203           { SOFT },
  204           { IA },
  205           { E, V },
  206           { O, V },
  207           { I, E },
  208           { SOFT, E },
  209           { IA, X },
  210           { I, IU },
  211           { E, I },
  212           { I, I },
  213           { E, I_ },
  214           { O, I_ },
  215           { E, M },
  216           { A, M },
  217           { O, M },
  218           { A, X },
  219           { SOFT, IU },
  220           { I, IA },
  221           { SOFT, IA },
  222           { I, I_ },
  223           { IA, M },
  224           { IA, M, I },
  225           { A, M, I },
  226           { I, E, I_ },
  227           { I, IA, M },
  228           { I, E, M },
  229           { I, IA, X },
  230           { I, IA, M, I }
  231       };
  232   
  233       private static char[][] superlativeEndings = {
  234           { E, I_, SH },
  235           { E, I_, SH, E }
  236       };
  237   
  238       private static char[][] derivationalEndings = {
  239           { O, S, T },
  240           { O, S, T, SOFT }
  241       };
  242   
  243       /**
  244        * RussianStemmer constructor comment.
  245        */
  246       public RussianStemmer()
  247       {
  248           super();
  249       }
  250   
  251       /**
  252        * Adjectival ending is an adjective ending,
  253        * optionally preceded by participle ending.
  254        * Creation date: (17/03/2002 12:14:58 AM)
  255        * @param stemmingZone java.lang.StringBuilder
  256        */
  257       private boolean adjectival(StringBuilder stemmingZone)
  258       {
  259           // look for adjective ending in a stemming zone
  260           if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
  261               return false;
  262           // if adjective ending was found, try for participle ending.
  263           // variable r is unused, we are just interested in the side effect of
  264           // findAndRemoveEnding():
  265           boolean r =
  266               findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
  267               ||
  268               findAndRemoveEnding(stemmingZone, participleEndings2);
  269           return true;
  270       }
  271   
  272       /**
  273        * Derivational endings
  274        * Creation date: (17/03/2002 12:14:58 AM)
  275        * @param stemmingZone java.lang.StringBuilder
  276        */
  277       private boolean derivational(StringBuilder stemmingZone)
  278       {
  279           int endingLength = findEnding(stemmingZone, derivationalEndings);
  280           if (endingLength == 0)
  281                // no derivational ending found
  282               return false;
  283           else
  284           {
  285               // Ensure that the ending locates in R2
  286               if (R2 - RV <= stemmingZone.length() - endingLength)
  287               {
  288                   stemmingZone.setLength(stemmingZone.length() - endingLength);
  289                   return true;
  290               }
  291               else
  292               {
  293                   return false;
  294               }
  295           }
  296       }
  297   
  298       /**
  299        * Finds ending among given ending class and returns the length of ending found(0, if not found).
  300        * Creation date: (17/03/2002 8:18:34 PM)
  301        */
  302       private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
  303       {
  304           boolean match = false;
  305           for (int i = theEndingClass.length - 1; i >= 0; i--)
  306           {
  307               char[] theEnding = theEndingClass[i];
  308               // check if the ending is bigger than stemming zone
  309               if (startIndex < theEnding.length - 1)
  310               {
  311                   match = false;
  312                   continue;
  313               }
  314               match = true;
  315               int stemmingIndex = startIndex;
  316               for (int j = theEnding.length - 1; j >= 0; j--)
  317               {
  318                   if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
  319                   {
  320                       match = false;
  321                       break;
  322                   }
  323               }
  324               // check if ending was found
  325               if (match)
  326               {
  327                   return theEndingClass[i].length; // cut ending
  328               }
  329           }
  330           return 0;
  331       }
  332   
  333       private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass)
  334       {
  335           return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
  336       }
  337   
  338       /**
  339        * Finds the ending among the given class of endings and removes it from stemming zone.
  340        * Creation date: (17/03/2002 8:18:34 PM)
  341        */
  342       private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
  343       {
  344           int endingLength = findEnding(stemmingZone, theEndingClass);
  345           if (endingLength == 0)
  346               // not found
  347               return false;
  348           else {
  349               stemmingZone.setLength(stemmingZone.length() - endingLength);
  350               // cut the ending found
  351               return true;
  352           }
  353       }
  354   
  355       /**
  356        * Finds the ending among the given class of endings, then checks if this ending was
  357        * preceded by any of given predecessors, and if so, removes it from stemming zone.
  358        * Creation date: (17/03/2002 8:18:34 PM)
  359        */
  360       private boolean findAndRemoveEnding(StringBuilder stemmingZone,
  361           char[][] theEndingClass, char[][] thePredessors)
  362       {
  363           int endingLength = findEnding(stemmingZone, theEndingClass);
  364           if (endingLength == 0)
  365               // not found
  366               return false;
  367           else
  368           {
  369               int predessorLength =
  370                   findEnding(stemmingZone,
  371                       stemmingZone.length() - endingLength - 1,
  372                       thePredessors);
  373               if (predessorLength == 0)
  374                   return false;
  375               else {
  376                   stemmingZone.setLength(stemmingZone.length() - endingLength);
  377                   // cut the ending found
  378                   return true;
  379               }
  380           }
  381   
  382       }
  383   
  384       /**
  385        * Marks positions of RV, R1 and R2 in a given word.
  386        * Creation date: (16/03/2002 3:40:11 PM)
  387        */
  388       private void markPositions(String word)
  389       {
  390           RV = 0;
  391           R1 = 0;
  392           R2 = 0;
  393           int i = 0;
  394           // find RV
  395           while (word.length() > i && !isVowel(word.charAt(i)))
  396           {
  397               i++;
  398           }
  399           if (word.length() - 1 < ++i)
  400               return; // RV zone is empty
  401           RV = i;
  402           // find R1
  403           while (word.length() > i && isVowel(word.charAt(i)))
  404           {
  405               i++;
  406           }
  407           if (word.length() - 1 < ++i)
  408               return; // R1 zone is empty
  409           R1 = i;
  410           // find R2
  411           while (word.length() > i && !isVowel(word.charAt(i)))
  412           {
  413               i++;
  414           }
  415           if (word.length() - 1 < ++i)
  416               return; // R2 zone is empty
  417           while (word.length() > i && isVowel(word.charAt(i)))
  418           {
  419               i++;
  420           }
  421           if (word.length() - 1 < ++i)
  422               return; // R2 zone is empty
  423           R2 = i;
  424       }
  425   
  426       /**
  427        * Checks if character is a vowel..
  428        * Creation date: (16/03/2002 10:47:03 PM)
  429        * @return boolean
  430        * @param letter char
  431        */
  432       private boolean isVowel(char letter)
  433       {
  434           for (int i = 0; i < vowels.length; i++)
  435           {
  436               if (letter == vowels[i])
  437                   return true;
  438           }
  439           return false;
  440       }
  441   
  442       /**
  443        * Noun endings.
  444        * Creation date: (17/03/2002 12:14:58 AM)
  445        * @param stemmingZone java.lang.StringBuilder
  446        */
  447       private boolean noun(StringBuilder stemmingZone)
  448       {
  449           return findAndRemoveEnding(stemmingZone, nounEndings);
  450       }
  451   
  452       /**
  453        * Perfective gerund endings.
  454        * Creation date: (17/03/2002 12:14:58 AM)
  455        * @param stemmingZone java.lang.StringBuilder
  456        */
  457       private boolean perfectiveGerund(StringBuilder stemmingZone)
  458       {
  459           return findAndRemoveEnding(
  460               stemmingZone,
  461               perfectiveGerundEndings1,
  462               perfectiveGerund1Predessors)
  463               || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
  464       }
  465   
  466       /**
  467        * Reflexive endings.
  468        * Creation date: (17/03/2002 12:14:58 AM)
  469        * @param stemmingZone java.lang.StringBuilder
  470        */
  471       private boolean reflexive(StringBuilder stemmingZone)
  472       {
  473           return findAndRemoveEnding(stemmingZone, reflexiveEndings);
  474       }
  475   
  476       /**
  477        * Insert the method's description here.
  478        * Creation date: (17/03/2002 12:14:58 AM)
  479        * @param stemmingZone java.lang.StringBuilder
  480        */
  481       private boolean removeI(StringBuilder stemmingZone)
  482       {
  483           if (stemmingZone.length() > 0
  484               && stemmingZone.charAt(stemmingZone.length() - 1) == I)
  485           {
  486               stemmingZone.setLength(stemmingZone.length() - 1);
  487               return true;
  488           }
  489           else
  490           {
  491               return false;
  492           }
  493       }
  494   
  495       /**
  496        * Insert the method's description here.
  497        * Creation date: (17/03/2002 12:14:58 AM)
  498        * @param stemmingZone java.lang.StringBuilder
  499        */
  500       private boolean removeSoft(StringBuilder stemmingZone)
  501       {
  502           if (stemmingZone.length() > 0
  503               && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
  504           {
  505               stemmingZone.setLength(stemmingZone.length() - 1);
  506               return true;
  507           }
  508           else
  509           {
  510               return false;
  511           }
  512       }
  513   
  514       /**
  515        * Finds the stem for given Russian word.
  516        * Creation date: (16/03/2002 3:36:48 PM)
  517        * @return java.lang.String
  518        * @param input java.lang.String
  519        */
  520       public String stem(String input)
  521       {
  522           markPositions(input);
  523           if (RV == 0)
  524               return input; //RV wasn't detected, nothing to stem
  525           StringBuilder stemmingZone = new StringBuilder(input.substring(RV));
  526           // stemming goes on in RV
  527           // Step 1
  528   
  529           if (!perfectiveGerund(stemmingZone))
  530           {
  531               reflexive(stemmingZone);
  532               // variable r is unused, we are just interested in the flow that gets
  533               // created by logical expression: apply adjectival(); if that fails,
  534               // apply verb() etc
  535               boolean r =
  536                   adjectival(stemmingZone)
  537                   || verb(stemmingZone)
  538                   || noun(stemmingZone);
  539           }
  540           // Step 2
  541           removeI(stemmingZone);
  542           // Step 3
  543           derivational(stemmingZone);
  544           // Step 4
  545           superlative(stemmingZone);
  546           undoubleN(stemmingZone);
  547           removeSoft(stemmingZone);
  548           // return result
  549           return input.substring(0, RV) + stemmingZone.toString();
  550       }
  551   
  552       /**
  553        * Superlative endings.
  554        * Creation date: (17/03/2002 12:14:58 AM)
  555        * @param stemmingZone java.lang.StringBuilder
  556        */
  557       private boolean superlative(StringBuilder stemmingZone)
  558       {
  559           return findAndRemoveEnding(stemmingZone, superlativeEndings);
  560       }
  561   
  562       /**
  563        * Undoubles N.
  564        * Creation date: (17/03/2002 12:14:58 AM)
  565        * @param stemmingZone java.lang.StringBuilder
  566        */
  567       private boolean undoubleN(StringBuilder stemmingZone)
  568       {
  569           char[][] doubleN = {
  570               { N, N }
  571           };
  572           if (findEnding(stemmingZone, doubleN) != 0)
  573           {
  574               stemmingZone.setLength(stemmingZone.length() - 1);
  575               return true;
  576           }
  577           else
  578           {
  579               return false;
  580           }
  581       }
  582   
  583       /**
  584        * Verb endings.
  585        * Creation date: (17/03/2002 12:14:58 AM)
  586        * @param stemmingZone java.lang.StringBuilder
  587        */
  588       private boolean verb(StringBuilder stemmingZone)
  589       {
  590           return findAndRemoveEnding(
  591               stemmingZone,
  592               verbEndings1,
  593               verb1Predessors)
  594               || findAndRemoveEnding(stemmingZone, verbEndings2);
  595       }
  596      
  597       /**
  598        * Static method for stemming.
  599        */
  600       public static String stemWord(String theWord)
  601       {
  602           RussianStemmer stemmer = new RussianStemmer();
  603           return stemmer.stem(theWord);
  604       }
  605   }

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » ru » [javadoc | source]