1 package org.apache.lucene.analysis.ru;
2
3 /**
4 * Copyright 2004 The Apache Software Foundation
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
import junit.framework.TestCase;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
25
26 /**
27 * Test case for RussianAnalyzer.
28 *
29 * @author Boris Okner
30 * @version $Id: TestRussianAnalyzer.java,v 1.6 2004/03/29 22:48:06 cutting Exp $
31 */
32
33 public class TestRussianAnalyzer extends TestCase
34 {
35 private InputStreamReader inWords;
36
37 private InputStreamReader sampleUnicode;
38
39 private Reader inWordsKOI8;
40
41 private Reader sampleKOI8;
42
43 private Reader inWords1251;
44
45 private Reader sample1251;
46
47 private File dataDir;
48
49 protected void setUp() throws Exception
50 {
51 dataDir = new File(System.getProperty("dataDir"));
52 }
53
54 public void testUnicode() throws IOException
55 {
56 RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
57 inWords =
58 new InputStreamReader(
59 new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUnicode.txt")),
60 "Unicode");
61
62 sampleUnicode =
63 new InputStreamReader(
64 new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resUnicode.htm")),
65 "Unicode");
66
67 TokenStream in = ra.tokenStream("all", inWords);
68
69 RussianLetterTokenizer sample =
70 new RussianLetterTokenizer(
71 sampleUnicode,
72 RussianCharsets.UnicodeRussian);
73
74 for (;;)
75 {
76 Token token = in.next();
77
78 if (token == null)
79 {
80 break;
81 }
82
83 Token sampleToken = sample.next();
84 assertEquals(
85 "Unicode",
86 token.termText(),
87 sampleToken == null
88 ? null
89 : sampleToken.termText());
90 }
91
92 inWords.close();
93 sampleUnicode.close();
94 }
95
96 public void testKOI8() throws IOException
97 {
98 //System.out.println(new java.util.Date());
99 RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.KOI8);
100 // KOI8
101 inWordsKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testKOI8.txt")), "iso-8859-1");
102
103 sampleKOI8 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/resKOI8.htm")), "iso-8859-1");
104
105 TokenStream in = ra.tokenStream("all", inWordsKOI8);
106 RussianLetterTokenizer sample =
107 new RussianLetterTokenizer(
108 sampleKOI8,
109 RussianCharsets.KOI8);
110
111 for (;;)
112 {
113 Token token = in.next();
114
115 if (token == null)
116 {
117 break;
118 }
119
120 Token sampleToken = sample.next();
121 assertEquals(
122 "KOI8",
123 token.termText(),
124 sampleToken == null
125 ? null
126 : sampleToken.termText());
127
128 }
129
130 inWordsKOI8.close();
131 sampleKOI8.close();
132 }
133
134 public void test1251() throws IOException
135 {
136 // 1251
137 inWords1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/test1251.txt")), "iso-8859-1");
138
139 sample1251 = new InputStreamReader(new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/res1251.htm")), "iso-8859-1");
140
141 RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.CP1251);
142 TokenStream in = ra.tokenStream("", inWords1251);
143 RussianLetterTokenizer sample =
144 new RussianLetterTokenizer(
145 sample1251,
146 RussianCharsets.CP1251);
147
148 for (;;)
149 {
150 Token token = in.next();
151
152 if (token == null)
153 {
154 break;
155 }
156
157 Token sampleToken = sample.next();
158 assertEquals(
159 "1251",
160 token.termText(),
161 sampleToken == null
162 ? null
163 : sampleToken.termText());
164
165 }
166
167 inWords1251.close();
168 sample1251.close();
169 }
170 }