2323PG_MODULE_MAGIC ;
2424
2525/*
26- * Unaccent dictionary uses a trie to find a character to replace. Each node of
27- * the trie is an array of 256 TrieChar structs (n-th element of array
28- * corresponds to byte)
26+ * An unaccent dictionary uses a trie to find a string to replace. Each node
27+ * of the trie is an array of 256 TrieChar structs; the N-th element of the
28+ * array corresponds to next byte value N. That element can contain both a
29+ * replacement string (to be used if the source string ends with this byte)
30+ * and a link to another trie node (to be followed if there are more bytes).
31+ *
32+ * Note that the trie search logic pays no attention to multibyte character
33+ * boundaries. This is OK as long as both the data entered into the trie and
34+ * the data we're trying to look up are validly encoded; no partial-character
35+ * matches will occur.
2936 */
3037typedef struct TrieChar
3138{
@@ -36,34 +43,38 @@ typedef struct TrieChar
3643
3744/*
3845 * placeChar - put str into trie's structure, byte by byte.
46+ *
47+ * If node is NULL, we need to make a new node, which will be returned;
48+ * otherwise the return value is the same as node.
3949 */
4050static TrieChar *
41- placeChar (TrieChar * node , unsigned char * str , int lenstr , char * replaceTo , int replacelen )
51+ placeChar (TrieChar * node , const unsigned char * str , int lenstr ,
52+ const char * replaceTo , int replacelen )
4253{
4354 TrieChar * curnode ;
4455
4556 if (!node )
46- {
47- node = palloc (sizeof (TrieChar ) * 256 );
48- memset (node , 0 , sizeof (TrieChar ) * 256 );
49- }
57+ node = (TrieChar * ) palloc0 (sizeof (TrieChar ) * 256 );
58+
59+ Assert (lenstr > 0 ); /* else str[0] doesn't exist */
5060
5161 curnode = node + * str ;
5262
53- if (lenstr = = 1 )
63+ if (lenstr < = 1 )
5464 {
5565 if (curnode -> replaceTo )
56- elog (WARNING , "duplicate TO argument, use first one" );
66+ elog (WARNING , "duplicate source strings, first one will be used " );
5767 else
5868 {
5969 curnode -> replacelen = replacelen ;
60- curnode -> replaceTo = palloc (replacelen );
70+ curnode -> replaceTo = ( char * ) palloc (replacelen );
6171 memcpy (curnode -> replaceTo , replaceTo , replacelen );
6272 }
6373 }
6474 else
6575 {
66- curnode -> nextChar = placeChar (curnode -> nextChar , str + 1 , lenstr - 1 , replaceTo , replacelen );
76+ curnode -> nextChar = placeChar (curnode -> nextChar , str + 1 , lenstr - 1 ,
77+ replaceTo , replacelen );
6778 }
6879
6980 return node ;
@@ -213,23 +224,35 @@ initTrie(char *filename)
213224}
214225
215226/*
216- * findReplaceTo - find multibyte character in trie
227+ * findReplaceTo - find longest possible match in trie
228+ *
229+ * On success, returns pointer to ending subnode, plus length of matched
230+ * source string in *p_matchlen. On failure, returns NULL.
217231 */
218232static TrieChar *
219- findReplaceTo (TrieChar * node , unsigned char * src , int srclen )
233+ findReplaceTo (TrieChar * node , const unsigned char * src , int srclen ,
234+ int * p_matchlen )
220235{
221- while (node )
236+ TrieChar * result = NULL ;
237+ int matchlen = 0 ;
238+
239+ * p_matchlen = 0 ; /* prevent uninitialized-variable warnings */
240+
241+ while (node && matchlen < srclen )
222242 {
223- node = node + * src ;
224- if (srclen == 1 )
225- return node ;
243+ node = node + src [matchlen ];
244+ matchlen ++ ;
245+
246+ if (node -> replaceTo )
247+ {
248+ result = node ;
249+ * p_matchlen = matchlen ;
250+ }
226251
227- src ++ ;
228- srclen -- ;
229252 node = node -> nextChar ;
230253 }
231254
232- return NULL ;
255+ return result ;
233256}
234257
235258PG_FUNCTION_INFO_V1 (unaccent_init );
@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)
280303 TrieChar * rootTrie = (TrieChar * ) PG_GETARG_POINTER (0 );
281304 char * srcchar = (char * ) PG_GETARG_POINTER (1 );
282305 int32 len = PG_GETARG_INT32 (2 );
283- char * srcstart ,
306+ char * srcstart = srcchar ,
284307 * trgchar = NULL ;
285- int charlen ;
286308 TSLexeme * res = NULL ;
287- TrieChar * node ;
288309
289- srcstart = srcchar ;
290- while (srcchar - srcstart < len )
310+ while (len > 0 )
291311 {
292- charlen = pg_mblen (srcchar );
312+ TrieChar * node ;
313+ int matchlen ;
293314
294- node = findReplaceTo (rootTrie , (unsigned char * ) srcchar , charlen );
315+ node = findReplaceTo (rootTrie , (unsigned char * ) srcchar , len ,
316+ & matchlen );
295317 if (node && node -> replaceTo )
296318 {
297319 if (!res )
@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)
309331 memcpy (trgchar , node -> replaceTo , node -> replacelen );
310332 trgchar += node -> replacelen ;
311333 }
312- else if ( res )
334+ else
313335 {
314- memcpy (trgchar , srcchar , charlen );
315- trgchar += charlen ;
336+ matchlen = pg_mblen (srcchar );
337+ if (res )
338+ {
339+ memcpy (trgchar , srcchar , matchlen );
340+ trgchar += matchlen ;
341+ }
316342 }
317343
318- srcchar += charlen ;
344+ srcchar += matchlen ;
345+ len -= matchlen ;
319346 }
320347
321348 if (res )
0 commit comments