Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » ru » [javadoc | source]
    1   package org.apache.lucene.analysis.ru;
    2   /**
    3    * Licensed to the Apache Software Foundation (ASF) under one or more
    4    * contributor license agreements.  See the NOTICE file distributed with
    5    * this work for additional information regarding copyright ownership.
    6    * The ASF licenses this file to You under the Apache License, Version 2.0
    7    * (the "License"); you may not use this file except in compliance with
    8    * the License.  You may obtain a copy of the License at
    9    *
   10    *     http://www.apache.org/licenses/LICENSE-2.0
   11    *
   12    * Unless required by applicable law or agreed to in writing, software
   13    * distributed under the License is distributed on an "AS IS" BASIS,
   14    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   15    * See the License for the specific language governing permissions and
   16    * limitations under the License.
   17    */
   18   
   19   /**
   20    * RussianCharsets class contains encodings schemes (charsets) and toLowerCase() method implementation
   21    * for russian characters in Unicode, KOI8 and CP1252.
   22    * Each encoding scheme contains lowercase (positions 0-31) and uppercase (position 32-63) characters.
   23    * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
   24    * and adding logic to toLowerCase() method for that charset.
   25    *
   26    *
   27    * @version $Id: RussianCharsets.java 656111 2008-05-14 05:37:45Z otis $
   28    */
   29   public class RussianCharsets
   30   {
   31       // Unicode Russian charset (lowercase letters only)
   32       public static char[] UnicodeRussian = {
   33           '\u0430',
   34           '\u0431',
   35           '\u0432',
   36           '\u0433',
   37           '\u0434',
   38           '\u0435',
   39           '\u0436',
   40           '\u0437',
   41           '\u0438',
   42           '\u0439',
   43           '\u043A',
   44           '\u043B',
   45           '\u043C',
   46           '\u043D',
   47           '\u043E',
   48           '\u043F',
   49           '\u0440',
   50           '\u0441',
   51           '\u0442',
   52           '\u0443',
   53           '\u0444',
   54           '\u0445',
   55           '\u0446',
   56           '\u0447',
   57           '\u0448',
   58           '\u0449',
   59           '\u044A',
   60           '\u044B',
   61           '\u044C',
   62           '\u044D',
   63           '\u044E',
   64           '\u044F',
   65           // upper case
   66           '\u0410',
   67           '\u0411',
   68           '\u0412',
   69           '\u0413',
   70           '\u0414',
   71           '\u0415',
   72           '\u0416',
   73           '\u0417',
   74           '\u0418',
   75           '\u0419',
   76           '\u041A',
   77           '\u041B',
   78           '\u041C',
   79           '\u041D',
   80           '\u041E',
   81           '\u041F',
   82           '\u0420',
   83           '\u0421',
   84           '\u0422',
   85           '\u0423',
   86           '\u0424',
   87           '\u0425',
   88           '\u0426',
   89           '\u0427',
   90           '\u0428',
   91           '\u0429',
   92           '\u042A',
   93           '\u042B',
   94           '\u042C',
   95           '\u042D',
   96           '\u042E',
   97           '\u042F',
   98           // numbers
   99           '0',
  100           '1',
  101           '2',
  102           '3',
  103           '4',
  104           '5',
  105           '6',
  106           '7',
  107           '8',
  108           '9'
  109       };
  110   
  111       // KOI8 charset
  112       public static char[] KOI8 = {
  113           0xc1,
  114           0xc2,
  115           0xd7,
  116           0xc7,
  117           0xc4,
  118           0xc5,
  119           0xd6,
  120           0xda,
  121           0xc9,
  122           0xca,
  123           0xcb,
  124           0xcc,
  125           0xcd,
  126           0xce,
  127           0xcf,
  128           0xd0,
  129           0xd2,
  130           0xd3,
  131           0xd4,
  132           0xd5,
  133           0xc6,
  134           0xc8,
  135           0xc3,
  136           0xde,
  137           0xdb,
  138           0xdd,
  139           0xdf,
  140           0xd9,
  141           0xd8,
  142           0xdc,
  143           0xc0,
  144           0xd1,
  145           // upper case
  146           0xe1,
  147           0xe2,
  148           0xf7,
  149           0xe7,
  150           0xe4,
  151           0xe5,
  152           0xf6,
  153           0xfa,
  154           0xe9,
  155           0xea,
  156           0xeb,
  157           0xec,
  158           0xed,
  159           0xee,
  160           0xef,
  161           0xf0,
  162           0xf2,
  163           0xf3,
  164           0xf4,
  165           0xf5,
  166           0xe6,
  167           0xe8,
  168           0xe3,
  169           0xfe,
  170           0xfb,
  171           0xfd,
  172           0xff,
  173           0xf9,
  174           0xf8,
  175           0xfc,
  176           0xe0,
  177           0xf1,
  178           // numbers
  179           '0',
  180           '1',
  181           '2',
  182           '3',
  183           '4',
  184           '5',
  185           '6',
  186           '7',
  187           '8',
  188           '9'
  189       };
  190   
  191       // CP1251 eharset
  192       public static char[] CP1251 = {
  193           0xE0,
  194           0xE1,
  195           0xE2,
  196           0xE3,
  197           0xE4,
  198           0xE5,
  199           0xE6,
  200           0xE7,
  201           0xE8,
  202           0xE9,
  203           0xEA,
  204           0xEB,
  205           0xEC,
  206           0xED,
  207           0xEE,
  208           0xEF,
  209           0xF0,
  210           0xF1,
  211           0xF2,
  212           0xF3,
  213           0xF4,
  214           0xF5,
  215           0xF6,
  216           0xF7,
  217           0xF8,
  218           0xF9,
  219           0xFA,
  220           0xFB,
  221           0xFC,
  222           0xFD,
  223           0xFE,
  224           0xFF,
  225           // upper case
  226           0xC0,
  227           0xC1,
  228           0xC2,
  229           0xC3,
  230           0xC4,
  231           0xC5,
  232           0xC6,
  233           0xC7,
  234           0xC8,
  235           0xC9,
  236           0xCA,
  237           0xCB,
  238           0xCC,
  239           0xCD,
  240           0xCE,
  241           0xCF,
  242           0xD0,
  243           0xD1,
  244           0xD2,
  245           0xD3,
  246           0xD4,
  247           0xD5,
  248           0xD6,
  249           0xD7,
  250           0xD8,
  251           0xD9,
  252           0xDA,
  253           0xDB,
  254           0xDC,
  255           0xDD,
  256           0xDE,
  257           0xDF,
  258           // numbers
  259           '0',
  260           '1',
  261           '2',
  262           '3',
  263           '4',
  264           '5',
  265           '6',
  266           '7',
  267           '8',
  268           '9'
  269       };
  270   
  271       public static char toLowerCase(char letter, char[] charset)
  272       {
  273           if (charset == UnicodeRussian)
  274           {
  275               if (letter >= '\u0430' && letter <= '\u044F')
  276               {
  277                   return letter;
  278               }
  279               if (letter >= '\u0410' && letter <= '\u042F')
  280               {
  281                   return (char) (letter + 32);
  282               }
  283           }
  284   
  285           if (charset == KOI8)
  286           {
  287               if (letter >= 0xe0 && letter <= 0xff)
  288               {
  289                   return (char) (letter - 32);
  290               }
  291               if (letter >= 0xc0 && letter <= 0xdf)
  292               {
  293                   return letter;
  294               }
  295   
  296           }
  297   
  298           if (charset == CP1251)
  299           {
  300               if (letter >= 0xC0 && letter <= 0xDF)
  301               {
  302                   return (char) (letter + 32);
  303               }
  304               if (letter >= 0xE0 && letter <= 0xFF)
  305               {
  306                   return letter;
  307               }
  308   
  309           }
  310   
  311           return Character.toLowerCase(letter);
  312       }
  313   }

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » ru » [javadoc | source]