1- /*
2- * This file contains public functions for conversion between
3- * client encoding and server (database) encoding.
1+ /*-------------------------------------------------------------------------
2+ *
3+ * mbutils.c
4+ * This file contains functions for encoding conversion.
5+ *
6+ * The string-conversion functions in this file share some API quirks.
7+ * Note the following:
8+ *
9+ * The functions return a palloc'd, null-terminated string if conversion
10+ * is required. However, if no conversion is performed, the given source
11+ * string pointer is returned as-is.
12+ *
13+ * Although the presence of a length argument means that callers can pass
14+ * non-null-terminated strings, care is required because the same string
15+ * will be passed back if no conversion occurs. Such callers *must* check
16+ * whether result == src and handle that case differently.
17+ *
18+ * If the source and destination encodings are the same, the source string
19+ * is returned without any verification; it's assumed to be valid data.
20+ * If that might not be the case, the caller is responsible for validating
21+ * the string using a separate call to pg_verify_mbstr(). Whenever the
22+ * source and destination encodings are different, the functions ensure that
23+ * the result is validly encoded according to the destination encoding.
24+ *
25+ *
26+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
27+ * Portions Copyright (c) 1994, Regents of the University of California
428 *
5- * Tatsuo Ishii
629 *
7- * src/backend/utils/mb/mbutils.c
30+ * IDENTIFICATION
31+ * src/backend/utils/mb/mbutils.c
32+ *
33+ *-------------------------------------------------------------------------
834 */
935#include "postgres.h"
1036
@@ -290,7 +316,6 @@ InitializeClientEncoding(void)
290316int
291317pg_get_client_encoding (void )
292318{
293- Assert (ClientEncoding );
294319 return ClientEncoding -> encoding ;
295320}
296321
@@ -300,29 +325,13 @@ pg_get_client_encoding(void)
300325const char *
301326pg_get_client_encoding_name (void )
302327{
303- Assert (ClientEncoding );
304328 return ClientEncoding -> name ;
305329}
306330
307331/*
308- * Apply encoding conversion on src and return it. The encoding
309- * conversion function is chosen from the pg_conversion system catalog
310- * marked as "default". If it is not found in the schema search path,
311- * it's taken from pg_catalog schema. If it even is not in the schema,
312- * warn and return src.
313- *
314- * If conversion occurs, a palloc'd null-terminated string is returned.
315- * In the case of no conversion, src is returned.
332+ * Convert src string to another encoding (general case).
316333 *
317- * CAUTION: although the presence of a length argument means that callers
318- * can pass non-null-terminated strings, care is required because the same
319- * string will be passed back if no conversion occurs. Such callers *must*
320- * check whether result == src and handle that case differently.
321- *
322- * Note: we try to avoid raising error, since that could get us into
323- * infinite recursion when this function is invoked during error message
324- * sending. It should be OK to raise error for overlength strings though,
325- * since the recursion will come with a shorter message.
334+ * See the notes about string conversion functions at the top of this file.
326335 */
327336unsigned char *
328337pg_do_encoding_conversion (unsigned char * src , int len ,
@@ -331,39 +340,32 @@ pg_do_encoding_conversion(unsigned char *src, int len,
331340 unsigned char * result ;
332341 Oid proc ;
333342
334- if (! IsTransactionState () )
335- return src ;
343+ if (len <= 0 )
344+ return src ; /* empty string is always valid */
336345
337346 if (src_encoding == dest_encoding )
338- return src ;
347+ return src ; /* no conversion required, assume valid */
339348
340- if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII )
341- return src ;
349+ if (dest_encoding == PG_SQL_ASCII )
350+ return src ; /* any string is valid in SQL_ASCII */
342351
343- if (len <= 0 )
352+ if (src_encoding == PG_SQL_ASCII )
353+ {
354+ /* No conversion is possible, but we must validate the result */
355+ (void ) pg_verify_mbstr (dest_encoding , (const char * ) src , len , false);
344356 return src ;
357+ }
358+
359+ if (!IsTransactionState ()) /* shouldn't happen */
360+ elog (ERROR , "cannot perform encoding conversion outside a transaction" );
345361
346362 proc = FindDefaultConversionProc (src_encoding , dest_encoding );
347363 if (!OidIsValid (proc ))
348- {
349- ereport (LOG ,
364+ ereport (ERROR ,
350365 (errcode (ERRCODE_UNDEFINED_FUNCTION ),
351366 errmsg ("default conversion function for encoding \"%s\" to \"%s\" does not exist" ,
352367 pg_encoding_to_char (src_encoding ),
353368 pg_encoding_to_char (dest_encoding ))));
354- return src ;
355- }
356-
357- /*
358- * XXX we should avoid throwing errors in OidFunctionCall. Otherwise we
359- * are going into infinite loop! So we have to make sure that the
360- * function exists before calling OidFunctionCall.
361- */
362- if (!SearchSysCacheExists1 (PROCOID , ObjectIdGetDatum (proc )))
363- {
364- elog (LOG , "cache lookup failed for function %u" , proc );
365- return src ;
366- }
367369
368370 /*
369371 * Allocate space for conversion result, being wary of integer overflow
@@ -387,7 +389,7 @@ pg_do_encoding_conversion(unsigned char *src, int len,
387389}
388390
389391/*
390- * Convert string using encoding_name. The source
392+ * Convert string to encoding encoding_name. The source
391393 * encoding is the DB encoding.
392394 *
393395 * BYTEA convert_to(TEXT string, NAME encoding_name) */
@@ -412,7 +414,7 @@ pg_convert_to(PG_FUNCTION_ARGS)
412414}
413415
414416/*
415- * Convert string using encoding_name. The destination
417+ * Convert string from encoding encoding_name. The destination
416418 * encoding is the DB encoding.
417419 *
418420 * TEXT convert_from(BYTEA string, NAME encoding_name) */
@@ -439,7 +441,7 @@ pg_convert_from(PG_FUNCTION_ARGS)
439441}
440442
441443/*
442- * Convert string using encoding_names .
444+ * Convert string between two arbitrary encodings .
443445 *
444446 * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
445447 */
@@ -472,8 +474,13 @@ pg_convert(PG_FUNCTION_ARGS)
472474 src_str = VARDATA_ANY (string );
473475 pg_verify_mbstr_len (src_encoding , src_str , len , false);
474476
475- dest_str = (char * ) pg_do_encoding_conversion (
476- (unsigned char * ) src_str , len , src_encoding , dest_encoding );
477+ /* perform conversion */
478+ dest_str = (char * ) pg_do_encoding_conversion ((unsigned char * ) src_str ,
479+ len ,
480+ src_encoding ,
481+ dest_encoding );
482+
483+ /* update len if conversion actually happened */
477484 if (dest_str != src_str )
478485 len = strlen (dest_str );
479486
@@ -503,10 +510,11 @@ pg_convert(PG_FUNCTION_ARGS)
503510Datum
504511length_in_encoding (PG_FUNCTION_ARGS )
505512{
506- bytea * string = PG_GETARG_BYTEA_P (0 );
513+ bytea * string = PG_GETARG_BYTEA_PP (0 );
507514 char * src_encoding_name = NameStr (* PG_GETARG_NAME (1 ));
508515 int src_encoding = pg_char_to_encoding (src_encoding_name );
509- int len = VARSIZE (string ) - VARHDRSZ ;
516+ const char * src_str ;
517+ int len ;
510518 int retval ;
511519
512520 if (src_encoding < 0 )
@@ -515,11 +523,19 @@ length_in_encoding(PG_FUNCTION_ARGS)
515523 errmsg ("invalid encoding name \"%s\"" ,
516524 src_encoding_name )));
517525
518- retval = pg_verify_mbstr_len (src_encoding , VARDATA (string ), len , false);
519- PG_RETURN_INT32 (retval );
526+ len = VARSIZE_ANY_EXHDR (string );
527+ src_str = VARDATA_ANY (string );
528+
529+ retval = pg_verify_mbstr_len (src_encoding , src_str , len , false);
520530
531+ PG_RETURN_INT32 (retval );
521532}
522533
534+ /*
535+ * Get maximum multibyte character length in the specified encoding.
536+ *
537+ * Note encoding is specified numerically, not by name as above.
538+ */
523539Datum
524540pg_encoding_max_length_sql (PG_FUNCTION_ARGS )
525541{
@@ -532,27 +548,31 @@ pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
532548}
533549
534550/*
535- * convert client encoding to server encoding.
551+ * Convert client encoding to server encoding.
552+ *
553+ * See the notes about string conversion functions at the top of this file.
536554 */
537555char *
538556pg_client_to_server (const char * s , int len )
539557{
540- Assert (ClientEncoding );
541-
542558 return pg_any_to_server (s , len , ClientEncoding -> encoding );
543559}
544560
545561/*
546- * convert any encoding to server encoding.
562+ * Convert any encoding to server encoding.
563+ *
564+ * See the notes about string conversion functions at the top of this file.
565+ *
566+ * Unlike the other string conversion functions, this will apply validation
567+ * even if encoding == DatabaseEncoding->encoding. The reason is that this
568+ * function is used to process data coming in from outside the database, and
569+ * we never want to just assume validity.
547570 */
548571char *
549572pg_any_to_server (const char * s , int len , int encoding )
550573{
551- Assert (DatabaseEncoding );
552- Assert (ClientEncoding );
553-
554574 if (len <= 0 )
555- return (char * ) s ;
575+ return (char * ) s ; /* empty string is always valid */
556576
557577 if (encoding == DatabaseEncoding -> encoding ||
558578 encoding == PG_SQL_ASCII )
@@ -594,46 +614,59 @@ pg_any_to_server(const char *s, int len, int encoding)
594614 return (char * ) s ;
595615 }
596616
597- if (ClientEncoding -> encoding == encoding )
617+ /* Fast path if we can use cached conversion function */
618+ if (encoding == ClientEncoding -> encoding )
598619 return perform_default_encoding_conversion (s , len , true);
599- else
600- return (char * ) pg_do_encoding_conversion (
601- (unsigned char * ) s , len , encoding , DatabaseEncoding -> encoding );
620+
621+ /* General case ... raises an error if called outside a transaction */
622+ return (char * ) pg_do_encoding_conversion ((unsigned char * ) s ,
623+ len ,
624+ encoding ,
625+ DatabaseEncoding -> encoding );
602626}
603627
604628/*
605- * convert server encoding to client encoding.
629+ * Convert server encoding to client encoding.
630+ *
631+ * See the notes about string conversion functions at the top of this file.
606632 */
607633char *
608634pg_server_to_client (const char * s , int len )
609635{
610- Assert (ClientEncoding );
611-
612636 return pg_server_to_any (s , len , ClientEncoding -> encoding );
613637}
614638
615639/*
616- * convert server encoding to any encoding.
640+ * Convert server encoding to any encoding.
641+ *
642+ * See the notes about string conversion functions at the top of this file.
617643 */
618644char *
619645pg_server_to_any (const char * s , int len , int encoding )
620646{
621- Assert (DatabaseEncoding );
622- Assert (ClientEncoding );
623-
624647 if (len <= 0 )
625- return (char * ) s ;
648+ return (char * ) s ; /* empty string is always valid */
626649
627650 if (encoding == DatabaseEncoding -> encoding ||
628- encoding == PG_SQL_ASCII ||
629- DatabaseEncoding -> encoding == PG_SQL_ASCII )
651+ encoding == PG_SQL_ASCII )
630652 return (char * ) s ; /* assume data is valid */
631653
632- if (ClientEncoding -> encoding == encoding )
654+ if (DatabaseEncoding -> encoding == PG_SQL_ASCII )
655+ {
656+ /* No conversion is possible, but we must validate the result */
657+ (void ) pg_verify_mbstr (encoding , s , len , false);
658+ return (char * ) s ;
659+ }
660+
661+ /* Fast path if we can use cached conversion function */
662+ if (encoding == ClientEncoding -> encoding )
633663 return perform_default_encoding_conversion (s , len , false);
634- else
635- return (char * ) pg_do_encoding_conversion (
636- (unsigned char * ) s , len , DatabaseEncoding -> encoding , encoding );
664+
665+ /* General case ... raises an error if called outside a transaction */
666+ return (char * ) pg_do_encoding_conversion ((unsigned char * ) s ,
667+ len ,
668+ DatabaseEncoding -> encoding ,
669+ encoding );
637670}
638671
639672/*
@@ -643,7 +676,8 @@ pg_server_to_any(const char *s, int len, int encoding)
643676 * SetClientEncoding(), no conversion is performed.
644677 */
645678static char *
646- perform_default_encoding_conversion (const char * src , int len , bool is_client_to_server )
679+ perform_default_encoding_conversion (const char * src , int len ,
680+ bool is_client_to_server )
647681{
648682 char * result ;
649683 int src_encoding ,
@@ -931,11 +965,11 @@ raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
931965 * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
932966 * When that matches the database encoding, we don't need to do anything. In
933967 * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
934- * database encoding, except for the C locale. (On Windows, we also permit a
968+ * database encoding, except for the C locale. (On Windows, we also permit a
935969 * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
936970 * gettext to the right codeset.
937971 *
938- * On Windows, gettext defaults to the Windows ANSI code page. This is a
972+ * On Windows, gettext defaults to the Windows ANSI code page. This is a
939973 * convenient departure for software that passes the strings to Windows ANSI
940974 * APIs, but we don't do that. Compel gettext to use database encoding or,
941975 * failing that, the LC_CTYPE encoding as it would on other platforms.
@@ -980,28 +1014,24 @@ pg_bind_textdomain_codeset(const char *domainname)
9801014int
9811015GetDatabaseEncoding (void )
9821016{
983- Assert (DatabaseEncoding );
9841017 return DatabaseEncoding -> encoding ;
9851018}
9861019
9871020const char *
9881021GetDatabaseEncodingName (void )
9891022{
990- Assert (DatabaseEncoding );
9911023 return DatabaseEncoding -> name ;
9921024}
9931025
9941026Datum
9951027getdatabaseencoding (PG_FUNCTION_ARGS )
9961028{
997- Assert (DatabaseEncoding );
9981029 return DirectFunctionCall1 (namein , CStringGetDatum (DatabaseEncoding -> name ));
9991030}
10001031
10011032Datum
10021033pg_client_encoding (PG_FUNCTION_ARGS )
10031034{
1004- Assert (ClientEncoding );
10051035 return DirectFunctionCall1 (namein , CStringGetDatum (ClientEncoding -> name ));
10061036}
10071037
@@ -1014,7 +1044,6 @@ pg_client_encoding(PG_FUNCTION_ARGS)
10141044int
10151045GetMessageEncoding (void )
10161046{
1017- Assert (MessageEncoding );
10181047 return MessageEncoding -> encoding ;
10191048}
10201049
0 commit comments