Skip to content

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit a02b37f

Browse files
committed
Additional unicode primitive functions.
Introduce unicode_version(), icu_unicode_version(), and unicode_assigned(). The latter requires introducing a new lookup table for the Unicode General Category, which is generated along with the other Unicode lookup tables. Discussion: https://postgr.es/m/CA+TgmoYzYR-yhU6k1XFCADeyj=Oyz2PkVsa3iKv+keM8wp-F_A@mail.gmail.com Reviewed-by: Peter Eisentraut
1 parent 7021d3b commit a02b37f
Copy full SHA for a02b37f

18 files changed

+4924
-22
lines changed

‎doc/src/sgml/func.sgml

Copy file name to clipboardExpand all lines: doc/src/sgml/func.sgml
+90-19
Original file line numberDiff line numberDiff line change
@@ -2859,6 +2859,22 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
28592859
</para></entry>
28602860
</row>
28612861

2862+
<row>
2863+
<entry role="func_table_entry"><para role="func_signature">
2864+
<indexterm>
2865+
<primary>unicode_assigned</primary>
2866+
</indexterm>
2867+
<function>unicode_assigned</function> ( <type>text</type> )
2868+
<returnvalue>boolean</returnvalue>
2869+
</para>
2870+
<para>
2871+
Returns <literal>true</literal> if all characters in the string are
2872+
assigned Unicode codepoints; <literal>false</literal> otherwise. This
2873+
function can only be used when the server encoding is
2874+
<literal>UTF8</literal>.
2875+
</para></entry>
2876+
</row>
2877+
28622878
<row>
28632879
<entry role="func_table_entry"><para role="func_signature">
28642880
<indexterm>
@@ -23427,25 +23443,6 @@ SELECT * FROM pg_ls_dir('.') WITH ORDINALITY AS t(ls,n);
2342723443
This is equivalent to <function>current_user</function>.
2342823444
</para></entry>
2342923445
</row>
23430-
23431-
<row>
23432-
<entry role="func_table_entry"><para role="func_signature">
23433-
<indexterm>
23434-
<primary>version</primary>
23435-
</indexterm>
23436-
<function>version</function> ()
23437-
<returnvalue>text</returnvalue>
23438-
</para>
23439-
<para>
23440-
Returns a string describing the <productname>PostgreSQL</productname>
23441-
server's version. You can also get this information from
23442-
<xref linkend="guc-server-version"/>, or for a machine-readable
23443-
version use <xref linkend="guc-server-version-num"/>. Software
23444-
developers should use <varname>server_version_num</varname> (available
23445-
since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
23446-
parsing the text version.
23447-
</para></entry>
23448-
</row>
2344923446
</tbody>
2345023447
</tgroup>
2345123448
</table>
@@ -26332,6 +26329,80 @@ SELECT collation for ('foo' COLLATE "de_DE");
2633226329

2633326330
</sect2>
2633426331

26332+
<sect2 id="functions-info-version">
26333+
<title>Version Information Functions</title>
26334+
26335+
<para>
26336+
The functions shown in <xref linkend="functions-version"/>
26337+
print version information.
26338+
</para>
26339+
26340+
<table id="functions-version">
26341+
<title>Version Information Functions</title>
26342+
<tgroup cols="1">
26343+
<thead>
26344+
<row>
26345+
<entry role="func_table_entry"><para role="func_signature">
26346+
Function
26347+
</para>
26348+
<para>
26349+
Description
26350+
</para></entry>
26351+
</row>
26352+
</thead>
26353+
26354+
<tbody>
26355+
<row>
26356+
<entry role="func_table_entry"><para role="func_signature">
26357+
<indexterm>
26358+
<primary>version</primary>
26359+
</indexterm>
26360+
<function>version</function> ()
26361+
<returnvalue>text</returnvalue>
26362+
</para>
26363+
<para>
26364+
Returns a string describing the <productname>PostgreSQL</productname>
26365+
server's version. You can also get this information from
26366+
<xref linkend="guc-server-version"/>, or for a machine-readable
26367+
version use <xref linkend="guc-server-version-num"/>. Software
26368+
developers should use <varname>server_version_num</varname> (available
26369+
since 8.2) or <xref linkend="libpq-PQserverVersion"/> instead of
26370+
parsing the text version.
26371+
</para></entry>
26372+
</row>
26373+
26374+
<row>
26375+
<entry role="func_table_entry"><para role="func_signature">
26376+
<indexterm>
26377+
<primary>unicode_version</primary>
26378+
</indexterm>
26379+
<function>unicode_version</function> ()
26380+
<returnvalue>text</returnvalue>
26381+
</para>
26382+
<para>
26383+
Returns a string representing the version of Unicode used by
26384+
<productname>PostgreSQL</productname>.
26385+
</para></entry>
26386+
</row>
26387+
<row>
26388+
<entry role="func_table_entry"><para role="func_signature">
26389+
<indexterm>
26390+
<primary>icu_unicode_version</primary>
26391+
</indexterm>
26392+
<function>icu_unicode_version</function> ()
26393+
<returnvalue>text</returnvalue>
26394+
</para>
26395+
<para>
26396+
Returns a string representing the version of Unicode used by ICU, if
26397+
the server was built with ICU support; otherwise returns
26398+
<literal>NULL</literal>. </para></entry>
26399+
</row>
26400+
</tbody>
26401+
</tgroup>
26402+
</table>
26403+
26404+
</sect2>
26405+
2633526406
</sect1>
2633626407

2633726408
<sect1 id="functions-admin">

‎src/backend/utils/adt/varlena.c

Copy file name to clipboardExpand all lines: src/backend/utils/adt/varlena.c
+61
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
#include "catalog/pg_type.h"
2424
#include "common/hashfn.h"
2525
#include "common/int.h"
26+
#include "common/unicode_category.h"
2627
#include "common/unicode_norm.h"
28+
#include "common/unicode_version.h"
2729
#include "funcapi.h"
2830
#include "lib/hyperloglog.h"
2931
#include "libpq/pqformat.h"
@@ -6237,6 +6239,65 @@ unicode_norm_form_from_string(const char *formstr)
62376239
return form;
62386240
}
62396241

6242+
/*
6243+
* Returns version of Unicode used by Postgres in "major.minor" format (the
6244+
* same format as the Unicode version reported by ICU). The third component
6245+
* ("update version") never involves additions to the character repertoire and
6246+
* is unimportant for most purposes.
6247+
*
6248+
* See: https://unicode.org/versions/
6249+
*/
6250+
Datum
6251+
unicode_version(PG_FUNCTION_ARGS)
6252+
{
6253+
PG_RETURN_TEXT_P(cstring_to_text(PG_UNICODE_VERSION));
6254+
}
6255+
6256+
/*
6257+
* Returns version of Unicode used by ICU, if enabled; otherwise NULL.
6258+
*/
6259+
Datum
6260+
icu_unicode_version(PG_FUNCTION_ARGS)
6261+
{
6262+
#ifdef USE_ICU
6263+
PG_RETURN_TEXT_P(cstring_to_text(U_UNICODE_VERSION));
6264+
#else
6265+
PG_RETURN_NULL();
6266+
#endif
6267+
}
6268+
6269+
/*
6270+
* Check whether the string contains only assigned Unicode code
6271+
* points. Requires that the database encoding is UTF-8.
6272+
*/
6273+
Datum
6274+
unicode_assigned(PG_FUNCTION_ARGS)
6275+
{
6276+
text *input = PG_GETARG_TEXT_PP(0);
6277+
unsigned char *p;
6278+
int size;
6279+
6280+
if (GetDatabaseEncoding() != PG_UTF8)
6281+
ereport(ERROR,
6282+
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
6283+
6284+
/* decode each UTF-8 character in turn and check its general category */
6285+
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
6286+
p = (unsigned char *) VARDATA_ANY(input);
6287+
for (int i = 0; i < size; i++)
6288+
{
6289+
pg_wchar uchar = utf8_to_unicode(p);
6290+
int category = unicode_category(uchar);
6291+
6292+
if (category == PG_U_UNASSIGNED)
6293+
PG_RETURN_BOOL(false);
6294+
6295+
p += pg_utf_mblen(p);
6296+
}
6297+
6298+
PG_RETURN_BOOL(true);
6299+
}
6300+
62406301
Datum
62416302
unicode_normalize_func(PG_FUNCTION_ARGS)
62426303
{

‎src/common/Makefile

Copy file name to clipboardExpand all lines: src/common/Makefile
+1
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ OBJS_COMMON = \
7878
scram-common.o \
7979
string.o \
8080
stringinfo.o \
81+
unicode_category.o \
8182
unicode_norm.o \
8283
username.o \
8384
wait_error.o \

‎src/common/meson.build

Copy file name to clipboardExpand all lines: src/common/meson.build
+1
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ common_sources = files(
3030
'scram-common.c',
3131
'string.c',
3232
'stringinfo.c',
33+
'unicode_category.c',
3334
'unicode_norm.c',
3435
'username.c',
3536
'wait_error.c',

‎src/common/unicode/Makefile

Copy file name to clipboardExpand all lines: src/common/unicode/Makefile
+17-2
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,15 @@ include $(top_builddir)/src/Makefile.global
1515
override CPPFLAGS := -DFRONTEND -I. $(CPPFLAGS)
1616
LIBS += $(PTHREAD_LIBS)
1717

18+
LDFLAGS_INTERNAL += $(ICU_LIBS)
19+
CPPFLAGS += $(ICU_CFLAGS)
20+
1821
# By default, do nothing.
1922
all:
2023

21-
update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h
24+
update-unicode: unicode_category_table.h unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asian_fw_table.h unicode_normprops_table.h unicode_norm_hashfunc.h unicode_version.h
2225
mv $^ $(top_srcdir)/src/include/common/
26+
$(MAKE) category-check
2327
$(MAKE) normalization-check
2428

2529
# These files are part of the Unicode Character Database. Download
@@ -28,6 +32,12 @@ update-unicode: unicode_norm_table.h unicode_nonspacing_table.h unicode_east_asi
2832
UnicodeData.txt EastAsianWidth.txt DerivedNormalizationProps.txt CompositionExclusions.txt NormalizationTest.txt: $(top_builddir)/src/Makefile.global
2933
$(DOWNLOAD) https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$(@F)
3034

35+
unicode_version.h: generate-unicode_version.pl
36+
$(PERL) $< --version $(UNICODE_VERSION)
37+
38+
unicode_category_table.h: generate-unicode_category_table.pl UnicodeData.txt
39+
$(PERL) $<
40+
3141
# Generation of conversion tables used for string normalization with
3242
# UTF-8 strings.
3343
unicode_norm_hashfunc.h: unicode_norm_table.h
@@ -45,9 +55,14 @@ unicode_normprops_table.h: generate-unicode_normprops_table.pl DerivedNormalizat
4555
$(PERL) $^ >$@
4656

4757
# Test suite
58+
category-check: category_test
59+
./category_test
60+
4861
normalization-check: norm_test
4962
./norm_test
5063

64+
category_test: category_test.o ../unicode_category.o | submake-common
65+
5166
norm_test: norm_test.o ../unicode_norm.o | submake-common
5267

5368
norm_test.o: norm_test_table.h
@@ -64,7 +79,7 @@ norm_test_table.h: generate-norm_test_table.pl NormalizationTest.txt
6479

6580

6681
clean:
67-
rm -f $(OBJS) norm_test norm_test.o
82+
rm -f $(OBJS) category_test category_test.o norm_test norm_test.o
6883

6984
distclean: clean
7085
rm -f UnicodeData.txt EastAsianWidth.txt CompositionExclusions.txt NormalizationTest.txt norm_test_table.h unicode_norm_table.h

‎src/common/unicode/category_test.c

Copy file name to clipboard
+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*-------------------------------------------------------------------------
2+
* category_test.c
3+
* Program to test Unicode general category functions.
4+
*
5+
* Portions Copyright (c) 2017-2023, PostgreSQL Global Development Group
6+
*
7+
* IDENTIFICATION
8+
* src/common/unicode/category_test.c
9+
*
10+
*-------------------------------------------------------------------------
11+
*/
12+
#include "postgres_fe.h"
13+
14+
#include <stdio.h>
15+
#include <stdlib.h>
16+
#include <string.h>
17+
18+
#ifdef USE_ICU
19+
#include <unicode/uchar.h>
20+
#endif
21+
#include "common/unicode_category.h"
22+
#include "common/unicode_version.h"
23+
24+
/*
25+
* Parse version into integer for easy comparison.
26+
*/
27+
#ifdef USE_ICU
28+
static int
29+
parse_unicode_version(const char *version)
30+
{
31+
int n,
32+
major,
33+
minor;
34+
35+
n = sscanf(version, "%d.%d", &major, &minor);
36+
37+
Assert(n == 2);
38+
Assert(minor < 100);
39+
40+
return major * 100 + minor;
41+
}
42+
#endif
43+
44+
/*
45+
* Exhaustively test that the Unicode category for each codepoint matches that
46+
* returned by ICU.
47+
*/
48+
int
49+
main(int argc, char **argv)
50+
{
51+
#ifdef USE_ICU
52+
int pg_unicode_version = parse_unicode_version(PG_UNICODE_VERSION);
53+
int icu_unicode_version = parse_unicode_version(U_UNICODE_VERSION);
54+
int pg_skipped_codepoints = 0;
55+
int icu_skipped_codepoints = 0;
56+
57+
printf("Postgres Unicode Version:\t%s\n", PG_UNICODE_VERSION);
58+
printf("ICU Unicode Version:\t\t%s\n", U_UNICODE_VERSION);
59+
60+
for (UChar32 code = 0; code <= 0x10ffff; code++)
61+
{
62+
uint8_t pg_category = unicode_category(code);
63+
uint8_t icu_category = u_charType(code);
64+
65+
if (pg_category != icu_category)
66+
{
67+
/*
68+
* A version mismatch means that some assigned codepoints in the
69+
* newer version may be unassigned in the older version. That's
70+
* OK, though the test will not cover those codepoints marked
71+
* unassigned in the older version (that is, it will no longer be
72+
* an exhaustive test).
73+
*/
74+
if (pg_category == PG_U_UNASSIGNED &&
75+
pg_unicode_version < icu_unicode_version)
76+
pg_skipped_codepoints++;
77+
else if (icu_category == PG_U_UNASSIGNED &&
78+
icu_unicode_version < pg_unicode_version)
79+
icu_skipped_codepoints++;
80+
else
81+
{
82+
printf("FAILURE for codepoint %06x\n", code);
83+
printf("Postgres category: %02d %s %s\n", pg_category,
84+
unicode_category_abbrev(pg_category),
85+
unicode_category_string(pg_category));
86+
printf("ICU category: %02d %s %s\n", icu_category,
87+
unicode_category_abbrev(icu_category),
88+
unicode_category_string(icu_category));
89+
printf("\n");
90+
exit(1);
91+
}
92+
}
93+
}
94+
95+
if (pg_skipped_codepoints > 0)
96+
printf("Skipped %d codepoints unassigned in Postgres due to Unicode version mismatch.\n",
97+
pg_skipped_codepoints);
98+
if (icu_skipped_codepoints > 0)
99+
printf("Skipped %d codepoints unassigned in ICU due to Unicode version mismatch.\n",
100+
icu_skipped_codepoints);
101+
102+
printf("category_test: All tests successful!\n");
103+
exit(0);
104+
#else
105+
printf("ICU support required for test; skipping.\n");
106+
exit(0);
107+
#endif
108+
}

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.