1C-Bitrix 25.700.0
Загрузка...
Поиск...
Не найдено
stemming.php
См. документацию.
1<?php
2
/**
 * Builds (once) and returns the stemming configuration of a language.
 *
 * On the first call the configuration of every language returned by
 * CLanguage::GetList() is initialised. Results are memoised in a static
 * array for the rest of the request.
 *
 * Each configuration is an array with the keys:
 *  - 'stem', 'stop', 'upper': names of the functions to call;
 *  - 'letters', 'pcre_letters': characters treated as parts of a word, raw
 *    and escaped for use inside a PCRE character class;
 *  - 'abc', 'pcre_abc': the alphabet of the language, raw and escaped.
 *
 * @param string|false $sLang Language identifier, or false to get the
 *                            configurations of all initialised languages.
 *
 * @return array Configuration of $sLang, or a map "language id => configuration"
 *               when $sLang is false.
 */
function stemming_init($sLang='ru')
{
	static $arStemFunc = false;

	//Init all languages
	if ($arStemFunc === false)
	{
		//Must become an array before recursing, otherwise recursion never ends
		$arStemFunc = [];
		$rsLanguages = CLanguage::GetList();
		while ($arLanguage = $rsLanguages->Fetch())
		{
			stemming_init($arLanguage['LID']);
		}
	}

	//Check if language was not used
	if ($sLang !== false && !isset($arStemFunc[$sLang]))
	{
		$stemming_function = 'stemming_' . $sLang;
		$stemming_function_suf = $sLang;

		// NOTE(review): $sLang becomes part of an include path below; callers are
		// presumably expected to pass a trusted language id - confirm.

		//1. Project level override
		if (!function_exists($stemming_function))
		{
			$strFileName = $_SERVER['DOCUMENT_ROOT'] . BX_PERSONAL_ROOT . '/php_interface/' . $sLang . '/search/stemming.php';
			if (file_exists($strFileName))
			{
				@include $strFileName;
			}
		}

		//2. Stemmer shipped with the search module
		if (!function_exists($stemming_function))
		{
			$strFileName = $_SERVER['DOCUMENT_ROOT'] . '/bitrix/modules/search/tools/' . $sLang . '/stemming.php';
			if (file_exists($strFileName))
			{
				if (\Bitrix\Main\Localization\Translation::allowConvertEncoding())
				{
					\Bitrix\Main\Localization\StreamConverter::include($strFileName, $sLang);
				}
				else
				{
					@include $strFileName;
				}
			}
		}

		//3. Nothing found: use the pass-through stemmer
		if (!function_exists($stemming_function))
		{
			$stemming_function_suf = 'default';
		}

		$stemming_stop_function = 'stemming_stop_' . $sLang;
		if (!function_exists($stemming_stop_function))
		{
			$stemming_stop_function = 'stemming_stop_default';
		}

		$stemming_upper_function = 'stemming_upper_' . $sLang;
		if (!function_exists($stemming_upper_function))
		{
			$stemming_upper_function = 'stemming_upper_default';
		}

		//Alphabet of the language (empty when the language does not define one)
		$abc = '';
		$letters = stemming_letter_default();
		$stemming_letter_function = 'stemming_letter_' . $sLang;
		if (function_exists($stemming_letter_function))
		{
			$abc = $stemming_letter_function();
			$letters .= $abc;
		}
		// Do not use CPageOption feature in the real project. This is for unit tests only.
		$letters .= CPageOption::GetOptionString('search', 'letters') ?: COption::GetOptionString('search', 'letters');

		//Loose comparison on purpose: any empty-ish result counts as "no alphabet"
		if ($abc == '')
		{
			//Fall back to the default alphabet
			$abc = stemming_letter_default();
		}

		//Escapes characters which are special inside a PCRE character class
		$escapeForCharClass = static function ($chars)
		{
			return str_replace(
				['\\' , '-' , '^' , ']' , '/'],
				['\\\\', '\\-', '\\^', '\\]', '\\/'],
				$chars
			);
		};

		$arStemFunc[$sLang] = [
			'stem' => 'stemming_' . $stemming_function_suf,
			'stop' => $stemming_stop_function,
			'upper' => $stemming_upper_function,
			'letters' => $letters,
			'pcre_letters' => '\\w\\d' . $escapeForCharClass($letters),
			'abc' => $abc,
			'pcre_abc' => '\\w\\d' . $escapeForCharClass($abc),
		];
	}

	return $sLang === false ? $arStemFunc : $arStemFunc[$sLang];
}
114
/**
 * Converts text to upper case using the function registered for the language.
 *
 * @param string $sText Text to convert.
 * @param string $sLang Language identifier.
 *
 * @return string Whatever the 'upper' function of the language returns.
 */
function stemming_upper($sText, $sLang='ru')
{
	$arStemFunc = stemming_init($sLang);

	//'upper' holds the name of the function to call
	return $arStemFunc['upper']($sText);
}
121
/**
 * Splits text into upper-cased words and reports where each distinct word
 * first occurs.
 *
 * Offsets are measured in characters and refer to the normalised text (upper
 * case, every non-letter replaced by a space), not to the original input.
 * Words are cut to 100 characters.
 *
 * @param string $sText Text to split.
 * @param string $sLang Language identifier.
 *
 * @return array Map "word => character offset of its first occurrence".
 */
function stemming_split($sText, $sLang='ru')
{
	$arStemFunc = stemming_init($sLang);

	$words = [];

	$tok = ' ';
	$sText = stemming_upper($sText, $sLang);
	$sText = preg_replace('/[^' . $arStemFunc['pcre_letters'] . ']/u', $tok, $sText);
	if (!is_string($sText))
	{
		//preg_replace() gives null on failure (e.g. malformed UTF-8): nothing to split
		return $words;
	}

	//preg_split() instead of strtok(): no shared tokenizer state and the
	//offset of every token comes for free
	$tokens = preg_split('/ +/', $sText, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE);
	if ($tokens === false)
	{
		return $words;
	}

	//Byte offsets are converted to character offsets incrementally,
	//so the text is measured only once
	$bytePos = 0;
	$charPos = 0;
	foreach ($tokens as $token)
	{
		$charPos += mb_strlen(substr($sText, $bytePos, $token[1] - $bytePos));
		$bytePos = $token[1];

		$word = mb_substr($token[0], 0, 100);

		if (!isset($words[$word]))
		{
			//Offset of the word itself. Searching the text for the word would
			//also match it inside an earlier, longer word.
			$words[$word] = $charPos;
		}
	}

	return $words;
}
147
/**
 * Stems every word of the text.
 *
 * The text is upper-cased, everything except word characters is replaced by
 * spaces and every word (cut to 50 characters) goes through the stemmer of
 * $sLang. When the stemmer leaves a word untouched and the word has characters
 * outside of the alphabet of $sLang, stemming_detect() tries the other languages.
 *
 * @param string $sText Text to process.
 * @param string $sLang Language identifier.
 * @param bool $bIgnoreStopWords When true the stop functions are not consulted
 *                               and every stem is kept.
 * @param bool $bReturnPositions When true the value of a stem is the sum of the
 *                               positions of its occurrences (a sentence break
 *                               adds 5 to the position). When false the position
 *                               is always 1, so the value is the number of
 *                               occurrences.
 *
 * @return array Map "stem => number".
 */
function stemming($sText, $sLang='ru', $bIgnoreStopWords = false, $bReturnPositions = false)
{
	//Results of the stop functions: language of the stop function => stem => result
	static $STOP_CACHE = [];

	//Result
	$stems = [];

	//Get info about all languages
	$arStemInfo = stemming_init(false);
	//Add default functions if language was not defined
	if (!isset($arStemInfo[$sLang]))
	{
		$arStemInfo[$sLang] = stemming_init($sLang);
	}

	$stem_func = $arStemInfo[$sLang]['stem'];
	$pcre_abc = '/[^' . $arStemInfo[$sLang]['pcre_abc'] . ']+/u';

	//Delimiter of the words
	$tok = ' ';
	$sText = stemming_upper($sText, $sLang);

	//Sentence terminators have to survive when positions are requested
	$keep = $arStemInfo[$sLang]['pcre_letters'] . ($bReturnPositions ? '.!?' : '');
	$sText = preg_replace('/[^' . $keep . ']+/u', $tok, $sText);
	if ($bReturnPositions && is_string($sText))
	{
		$sText = preg_replace('/[!?]+/u', '.', $sText);
	}
	if (!is_string($sText))
	{
		//preg_replace() gives null on failure (e.g. malformed UTF-8): nothing to stem
		return $stems;
	}

	//Parse text
	//preg_split() instead of strtok(): strtok() keeps one global state which
	//a stem or stop callback could reset in the middle of the loop
	$chunks = preg_split('/ +/', $sText, -1, PREG_SPLIT_NO_EMPTY) ?: [];
	$pos = 1;
	foreach ($chunks as $chunk)
	{
		$words = $bReturnPositions ? explode('.', $chunk) : [$chunk];

		foreach ($words as $i => $word)
		{
			$word = mb_substr($word, 0, 50);

			if ($bReturnPositions)
			{
				if ($i > 0)
				{
					$pos += 5; //Sentence distance
				}
				if ($word === '')
				{
					continue;
				}
			}

			//Try to stem starting with desired language
			//1 - stemming may return more than one word
			$stem = $stem_func($word, 1);
			$stop_lang = $sLang;

			//If word equals its stem
			//and has letters not from ABC
			if (
				!is_array($stem)
				&& $stem === $word
				&& preg_match($pcre_abc, $word)
			)
			{
				//Do the best to detect correct one
				$guess = stemming_detect($word, $arStemInfo, $sLang);
				if ($guess[0] <> '')
				{
					$stem = $guess[0];
					$stop_lang = $guess[1];
				}
			}

			$arStems = is_array($stem) ? $stem : [$stem];
			foreach ($arStems as $st)
			{
				if (!$bIgnoreStopWords)
				{
					//Cache by the language whose stop function is called, so
					//results of different stop functions never get mixed up
					if (!isset($STOP_CACHE[$stop_lang][$st]))
					{
						$stop_func = $arStemInfo[$stop_lang]['stop'];
						$STOP_CACHE[$stop_lang][$st] = $stop_func($st);
					}

					//The stop function returns false for words to be dropped
					if (!$STOP_CACHE[$stop_lang][$st])
					{
						continue;
					}
				}

				$stems[$st] = isset($stems[$st]) ? $stems[$st] + $pos : $pos;
			}

			if ($bReturnPositions)
			{
				$pos++;
			}
		}
	}

	return $stems;
}
293
/**
 * Tries to stem a word with the stemmers of the other languages.
 *
 * Languages are tried in the order of $arStemInfo. A language is skipped when
 * it is $skipLang or when the word has characters outside of its 'pcre_abc'
 * alphabet. The search stops at the first language whose stemmer changes the
 * word or whose stop function returns a falsy value for it.
 *
 * @param string $word Word to stem.
 * @param array $arStemInfo Map "language id => configuration"; the keys
 *                          'pcre_abc', 'stem' and 'stop' of a configuration are used.
 * @param string $skipLang Language which must not be tried.
 *
 * @return array [stem, language id]. Both are empty strings when no language
 *               was applicable. When no language stopped the search, the result
 *               of the last applicable language is returned.
 */
function stemming_detect($word, $arStemInfo, $skipLang)
{
	$stem = '';
	$lang = '';

	foreach ($arStemInfo as $sGuessLang => $arInfo)
	{
		if ($sGuessLang === $skipLang)
		{
			continue;
		}

		//Word has letters not from ABC, so skip to next language
		if (preg_match('/[^' . $arInfo['pcre_abc'] . ']+/u', $word))
		{
			continue;
		}

		$stem_func = $arInfo['stem'];
		$stem = $stem_func($word);
		$lang = $sGuessLang;

		//It looks like stemming succeeded
		if ($stem !== $word)
		{
			break;
		}

		//Check if the stop function flags the word as a stop word
		$stop_func = $arInfo['stop'];
		if (!$stop_func($stem))
		{
			break;
		}
	}

	//It's the best we can do:
	//return the word and the language to use for the stop check
	return [$stem, $lang];
}
333
/**
 * Default upper case conversion, used when a language has no
 * stemming_upper_<lang> function.
 *
 * @param string $sText Text to convert.
 *
 * @return string Result of mb_strtoupper().
 */
function stemming_upper_default($sText)
{
	return mb_strtoupper($sText);
}
338
/**
 * Default stemmer, used when a language has no stemming_<lang> function.
 *
 * @param string $sText Word to stem.
 *
 * @return string The word, unchanged.
 */
function stemming_default($sText)
{
	return $sText;
}
343
/**
 * Default stop word check, used when a language has no
 * stemming_stop_<lang> function.
 *
 * @param string $sWord Word to check.
 *
 * @return bool False when the word is shorter than 2 characters,
 *              true otherwise.
 */
function stemming_stop_default($sWord)
{
	return mb_strlen($sWord) >= 2;
}
355
/**
 * Default set of characters a word may consist of: latin letters in both
 * cases and decimal digits.
 *
 * @return string The characters, as one string.
 */
function stemming_letter_default()
{
	return 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM0123456789';
}
static GetList($by="sort", $order="asc", $arFilter=[])
Определения language.php:12
static GetOptionString($module_id, $name, $def="", $site=false)
Определения pageoption.php:10
$arStemFunc
Определения get_search.php:35
$_SERVER["DOCUMENT_ROOT"]
Определения cron_frame.php:9
if(!defined('SITE_ID')) $lang
Определения include.php:91
$i
Определения factura.php:643
stemming_upper_default($sText)
Определения stemming.php:334
stemming_detect($word, $arStemInfo, $skipLang)
Определения stemming.php:294
stemming_split($sText, $sLang='ru')
Определения stemming.php:122
stemming_init($sLang='ru')
Определения stemming.php:3
stemming_stop_default($sWord)
Определения stemming.php:344
stemming_upper($sText, $sLang='ru')
Определения stemming.php:115
stemming_default($sText)
Определения stemming.php:339
stemming_letter_default()
Определения stemming.php:356
stemming($sText, $sLang='ru', $bIgnoreStopWords=false, $bReturnPositions=false)
Определения stemming.php:148