Skip to content

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit ea1db8a

Browse files
committed
Canonicalize ICU locale names to language tags.
Convert to BCP47 language tags before storing in the catalog, except during binary upgrade or when the locale comes from an existing collation or template database.

The resulting language tags can vary slightly between ICU versions. For instance, "@colBackwards=yes" is converted to "und-u-kb-true" in older versions of ICU, and to the simpler (but equivalent) "und-u-kb" in newer versions.

The process of canonicalizing to a language tag also understands more input locale string formats than ucol_open(). For instance, "fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is ignored, effectively treating it the same as the locale "fr" and opening the wrong collator. Canonicalization properly interprets the language and region, resulting in the language tag "fr-CA", which can then be understood by ucol_open().

This commit fixes a problem in prior versions due to ucol_open() misinterpreting locale strings as described above. For instance, creating an ICU collation with locale "fr_CA.UTF-8" would store that string directly in the catalog, which would later be passed to (and misinterpreted by) ucol_open(). After this commit, the locale string will be canonicalized to language tag "fr-CA" in the catalog, which will be properly understood by ucol_open().

Because this fix affects the resulting collator, we cannot change the locale string stored in the catalog for existing databases or collations; otherwise we'd risk corrupting indexes. Therefore, only canonicalize locales for newly-created (not upgraded) collations/databases. For similar reasons, do not backport.

Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut
1 parent d3d53f9 commit ea1db8a
Copy full SHA for ea1db8a

File tree

10 files changed

+258
-27
lines changed
Filter options

10 files changed

+258
-27
lines changed

‎doc/src/sgml/charset.sgml

Copy file name to clipboardExpand all lines: doc/src/sgml/charset.sgml
+1-1
Original file line numberDiff line numberDiff line change
@@ -893,7 +893,7 @@ CREATE COLLATION german (provider = libc, locale = 'de_DE');
893893
The first example selects the ICU locale using a <quote>language
894894
tag</quote> per BCP 47. The second example uses the traditional
895895
ICU-specific locale syntax. The first style is preferred going
896-
forward, but it is not supported by older ICU versions.
896+
forward, and is used internally to store locales.
897897
</para>
898898
<para>
899899
Note that you can name the collation objects in the SQL environment

‎src/backend/commands/collationcmds.c

Copy file name to clipboardExpand all lines: src/backend/commands/collationcmds.c
+25-21
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,11 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
165165
else
166166
colliculocale = NULL;
167167

168+
/*
169+
* When the ICU locale comes from an existing collation, do not
170+
* canonicalize to a language tag.
171+
*/
172+
168173
datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
169174
if (!isnull)
170175
collicurules = TextDatumGetCString(datum);
@@ -259,6 +264,25 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
259264
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
260265
errmsg("parameter \"locale\" must be specified")));
261266

267+
/*
268+
* During binary upgrade, preserve the locale string. Otherwise,
269+
* canonicalize to a language tag.
270+
*/
271+
if (!IsBinaryUpgrade)
272+
{
273+
char *langtag = icu_language_tag(colliculocale,
274+
icu_validation_level);
275+
276+
if (langtag && strcmp(colliculocale, langtag) != 0)
277+
{
278+
ereport(NOTICE,
279+
(errmsg("using standard form \"%s\" for locale \"%s\"",
280+
langtag, colliculocale)));
281+
282+
colliculocale = langtag;
283+
}
284+
}
285+
262286
icu_validate_locale(colliculocale);
263287
}
264288

@@ -569,26 +593,6 @@ cmpaliases(const void *a, const void *b)
569593

570594

571595
#ifdef USE_ICU
572-
/*
573-
* Get the ICU language tag for a locale name.
574-
* The result is a palloc'd string.
575-
*/
576-
static char *
577-
get_icu_language_tag(const char *localename)
578-
{
579-
char buf[ULOC_FULLNAME_CAPACITY];
580-
UErrorCode status;
581-
582-
status = U_ZERO_ERROR;
583-
uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status);
584-
if (U_FAILURE(status))
585-
ereport(ERROR,
586-
(errmsg("could not convert locale name \"%s\" to language tag: %s",
587-
localename, u_errorName(status))));
588-
589-
return pstrdup(buf);
590-
}
591-
592596
/*
593597
* Get a comment (specifically, the display name) for an ICU locale.
594598
* The result is a palloc'd string, or NULL if we can't get a comment
@@ -950,7 +954,7 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
950954
else
951955
name = uloc_getAvailable(i);
952956

953-
langtag = get_icu_language_tag(name);
957+
langtag = icu_language_tag(name, ERROR);
954958

955959
/*
956960
* Be paranoid about not allowing any non-ASCII strings into

‎src/backend/commands/dbcommands.c

Copy file name to clipboardExpand all lines: src/backend/commands/dbcommands.c
+20
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,26 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
10581058
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10591059
errmsg("ICU locale must be specified")));
10601060

1061+
/*
1062+
* During binary upgrade, or when the locale came from the template
1063+
* database, preserve the locale string. Otherwise, canonicalize to a
1064+
* language tag.
1065+
*/
1066+
if (!IsBinaryUpgrade && dbiculocale != src_iculocale)
1067+
{
1068+
char *langtag = icu_language_tag(dbiculocale,
1069+
icu_validation_level);
1070+
1071+
if (langtag && strcmp(dbiculocale, langtag) != 0)
1072+
{
1073+
ereport(NOTICE,
1074+
(errmsg("using standard form \"%s\" for locale \"%s\"",
1075+
langtag, dbiculocale)));
1076+
1077+
dbiculocale = langtag;
1078+
}
1079+
}
1080+
10611081
icu_validate_locale(dbiculocale);
10621082
}
10631083
else

‎src/backend/utils/adt/pg_locale.c

Copy file name to clipboardExpand all lines: src/backend/utils/adt/pg_locale.c
+85
Original file line numberDiff line numberDiff line change
@@ -2826,6 +2826,91 @@ icu_set_collation_attributes(UCollator *collator, const char *loc,
28262826

28272827
#endif
28282828

2829+
/*
2830+
* Return the BCP47 language tag representation of the requested locale.
2831+
*
2832+
* This function should be called before passing the string to ucol_open(),
2833+
* because conversion to a language tag also performs "level 2
2834+
* canonicalization". In addition to producing a consistent format, level 2
2835+
* canonicalization is able to more accurately interpret different input
2836+
* locale string formats, such as POSIX and .NET IDs.
2837+
*/
2838+
char *
2839+
icu_language_tag(const char *loc_str, int elevel)
2840+
{
2841+
#ifdef USE_ICU
2842+
UErrorCode status;
2843+
char lang[ULOC_LANG_CAPACITY];
2844+
char *langtag;
2845+
size_t buflen = 32; /* arbitrary starting buffer size */
2846+
const bool strict = true;
2847+
2848+
status = U_ZERO_ERROR;
2849+
uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2850+
if (U_FAILURE(status))
2851+
{
2852+
if (elevel > 0)
2853+
ereport(elevel,
2854+
(errmsg("could not get language from locale \"%s\": %s",
2855+
loc_str, u_errorName(status))));
2856+
return NULL;
2857+
}
2858+
2859+
/* C/POSIX locales aren't handled by uloc_toLanguageTag() */
2860+
if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
2861+
return pstrdup("en-US-u-va-posix");
2862+
2863+
/*
2864+
* A BCP47 language tag doesn't have a clearly-defined upper limit
2865+
* (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
2866+
* uloc_toLanguageTag() doesn't always return the ultimate length on the
2867+
* first call, necessitating a loop.
2868+
*/
2869+
langtag = palloc(buflen);
2870+
while (true)
2871+
{
2872+
int32_t len;
2873+
2874+
status = U_ZERO_ERROR;
2875+
len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
2876+
2877+
/*
2878+
* If the result fits in the buffer exactly (len == buflen),
2879+
* uloc_toLanguageTag() will return success without nul-terminating
2880+
* the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
2881+
* buflen and try again.
2882+
*/
2883+
if ((status == U_BUFFER_OVERFLOW_ERROR ||
2884+
(U_SUCCESS(status) && len >= buflen)) &&
2885+
buflen < MaxAllocSize)
2886+
{
2887+
buflen = Min(buflen * 2, MaxAllocSize);
2888+
langtag = repalloc(langtag, buflen);
2889+
continue;
2890+
}
2891+
2892+
break;
2893+
}
2894+
2895+
if (U_FAILURE(status))
2896+
{
2897+
pfree(langtag);
2898+
2899+
if (elevel > 0)
2900+
ereport(elevel,
2901+
(errmsg("could not convert locale name \"%s\" to language tag: %s",
2902+
loc_str, u_errorName(status))));
2903+
return NULL;
2904+
}
2905+
2906+
return langtag;
2907+
#else /* not USE_ICU */
2908+
ereport(ERROR,
2909+
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2910+
errmsg("ICU is not supported in this build")));
2911+
#endif /* not USE_ICU */
2912+
}
2913+
28292914
/*
28302915
* Perform best-effort check that the locale is a valid one.
28312916
*/

‎src/bin/initdb/initdb.c

Copy file name to clipboardExpand all lines: src/bin/initdb/initdb.c
+81
Original file line numberDiff line numberDiff line change
@@ -2229,6 +2229,78 @@ check_icu_locale_encoding(int user_enc)
22292229
return true;
22302230
}
22312231

2232+
/*
2233+
* Convert to canonical BCP47 language tag. Must be consistent with
2234+
* icu_language_tag() in the backend's pg_locale.c.
2235+
*/
2236+
static char *
2237+
icu_language_tag(const char *loc_str)
2238+
{
2239+
#ifdef USE_ICU
2240+
UErrorCode status;
2241+
char lang[ULOC_LANG_CAPACITY];
2242+
char *langtag;
2243+
size_t buflen = 32; /* arbitrary starting buffer size */
2244+
const bool strict = true;
2245+
2246+
status = U_ZERO_ERROR;
2247+
uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
2248+
if (U_FAILURE(status))
2249+
{
2250+
pg_fatal("could not get language from locale \"%s\": %s",
2251+
loc_str, u_errorName(status));
2252+
return NULL;
2253+
}
2254+
2255+
/* C/POSIX locales aren't handled by uloc_toLanguageTag() */
2256+
if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
2257+
return pstrdup("en-US-u-va-posix");
2258+
2259+
/*
2260+
* A BCP47 language tag doesn't have a clearly-defined upper limit
2261+
* (cf. RFC5646 section 4.4). Additionally, in older ICU versions,
2262+
* uloc_toLanguageTag() doesn't always return the ultimate length on the
2263+
* first call, necessitating a loop.
2264+
*/
2265+
langtag = pg_malloc(buflen);
2266+
while (true)
2267+
{
2268+
int32_t len;
2269+
2270+
status = U_ZERO_ERROR;
2271+
len = uloc_toLanguageTag(loc_str, langtag, buflen, strict, &status);
2272+
2273+
/*
2274+
* If the result fits in the buffer exactly (len == buflen),
2275+
* uloc_toLanguageTag() will return success without nul-terminating
2276+
* the result. Check for either U_BUFFER_OVERFLOW_ERROR or len >=
2277+
* buflen and try again.
2278+
*/
2279+
if (status == U_BUFFER_OVERFLOW_ERROR ||
2280+
(U_SUCCESS(status) && len >= buflen))
2281+
{
2282+
buflen = buflen * 2;
2283+
langtag = pg_realloc(langtag, buflen);
2284+
continue;
2285+
}
2286+
2287+
break;
2288+
}
2289+
2290+
if (U_FAILURE(status))
2291+
{
2292+
pg_free(langtag);
2293+
2294+
pg_fatal("could not convert locale name \"%s\" to language tag: %s",
2295+
loc_str, u_errorName(status));
2296+
}
2297+
2298+
return langtag;
2299+
#else
2300+
pg_fatal("ICU is not supported in this build");
2301+
#endif
2302+
}
2303+
22322304
/*
22332305
* Perform best-effort check that the locale is a valid one. Should be
22342306
* consistent with pg_locale.c, except that it doesn't need to open the
@@ -2376,13 +2448,22 @@ setlocales(void)
23762448

23772449
if (locale_provider == COLLPROVIDER_ICU)
23782450
{
2451+
char *langtag;
2452+
23792453
/* acquire default locale from the environment, if not specified */
23802454
if (icu_locale == NULL)
23812455
{
23822456
icu_locale = default_icu_locale();
23832457
printf(_("Using default ICU locale \"%s\".\n"), icu_locale);
23842458
}
23852459

2460+
/* canonicalize to a language tag */
2461+
langtag = icu_language_tag(icu_locale);
2462+
printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"),
2463+
langtag, icu_locale);
2464+
pg_free(icu_locale);
2465+
icu_locale = langtag;
2466+
23862467
icu_validate_locale(icu_locale);
23872468

23882469
/*

‎src/bin/initdb/t/001_initdb.pl

Copy file name to clipboardExpand all lines: src/bin/initdb/t/001_initdb.pl
+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@
144144
'--locale-provider=icu',
145145
'--icu-locale=@colNumeric=lower', "$tempdir/dataX"
146146
],
147-
qr/could not open collator for locale "\@colNumeric=lower": U_ILLEGAL_ARGUMENT_ERROR/,
147+
qr/could not open collator for locale "und-u-kn-lower": U_ILLEGAL_ARGUMENT_ERROR/,
148148
'fails for invalid collation argument');
149149
}
150150
else

‎src/bin/pg_dump/t/002_pg_dump.pl

Copy file name to clipboardExpand all lines: src/bin/pg_dump/t/002_pg_dump.pl
+2-2
Original file line numberDiff line numberDiff line change
@@ -1860,9 +1860,9 @@
18601860

18611861
'CREATE COLLATION icu_collation' => {
18621862
create_order => 76,
1863-
create_sql => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'C');",
1863+
create_sql => "CREATE COLLATION icu_collation (PROVIDER = icu, LOCALE = 'en-US-u-va-posix');",
18641864
regexp =>
1865-
qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'C'(, version = '[^']*')?\);/m,
1865+
qr/CREATE COLLATION public.icu_collation \(provider = icu, locale = 'en-US-u-va-posix'(, version = '[^']*')?\);/m,
18661866
icu => 1,
18671867
like => { %full_runs, section_pre_data => 1, },
18681868
},

‎src/include/utils/pg_locale.h

Copy file name to clipboardExpand all lines: src/include/utils/pg_locale.h
+1
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
120120
size_t srclen, pg_locale_t locale);
121121

122122
extern void icu_validate_locale(const char *loc_str);
123+
extern char *icu_language_tag(const char *loc_str, int elevel);
123124

124125
#ifdef USE_ICU
125126
extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.