Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 51e78ab

Browse files
committed
Avoid use of sscanf() to parse ispell dictionary files.
It turns out that on FreeBSD-derived platforms (including OS X), the *scanf() family of functions is pretty much brain-dead about multibyte characters. In particular it will apply isspace() to individual bytes of input even when those bytes are part of a multibyte character, thus allowing false recognition of a field-terminating space. We appear to have little alternative other than instituting a coding rule that *scanf() is not to be used if the input string might contain multibyte characters. (There was some discussion of relying on "%ls", but that probably just moves the portability problem somewhere else, and besides it doesn't fully prevent BSD *scanf() from using isspace().) This patch is a down payment on that: it gets rid of use of sscanf() to parse ispell dictionary files, which are certainly at great risk of having a problem. The code is cleaner this way anyway, though a bit longer. In passing, improve a few comments. Report and patch by Artur Zakirov, reviewed and somewhat tweaked by me. Back-patch to all supported branches.
1 parent c5e9b77 commit 51e78ab
Copy full SHA for 51e78ab

File tree

Expand file treeCollapse file tree

1 file changed

+153
-13
lines changed
Filter options
Expand file treeCollapse file tree

1 file changed

+153
-13
lines changed

‎src/backend/tsearch/spell.c

Copy file name to clipboardExpand all lines: src/backend/tsearch/spell.c
+153 -13 — Lines changed: 153 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -457,13 +457,149 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
457457
Conf->naffixes++;
458458
}
459459

460+
461+
/* Parsing states for parse_affentry() and friends */
460462
#define PAE_WAIT_MASK 0
461-
#define PAE_INMASK 1
463+
#define PAE_INMASK 1
462464
#define PAE_WAIT_FIND 2
463-
#define PAE_INFIND 3
465+
#define PAE_INFIND 3
464466
#define PAE_WAIT_REPL 4
465-
#define PAE_INREPL 5
467+
#define PAE_INREPL 5
468+
#define PAE_WAIT_TYPE 6
469+
#define PAE_WAIT_FLAG 7
466470

471+
/*
472+
* Parse next space-separated field of an .affix file line.
473+
*
474+
* *str is the input pointer (will be advanced past field)
475+
* next is where to copy the field value to, with null termination
476+
*
477+
* The buffer at "next" must be of size BUFSIZ; we truncate the input to fit.
478+
*
479+
* Returns TRUE if we found a field, FALSE if not.
480+
*/
481+
static bool
482+
get_nextfield(char **str, char *next)
483+
{
484+
int state = PAE_WAIT_MASK;
485+
int avail = BUFSIZ;
486+
487+
while (**str)
488+
{
489+
if (state == PAE_WAIT_MASK)
490+
{
491+
if (t_iseq(*str, '#'))
492+
return false;
493+
else if (!t_isspace(*str))
494+
{
495+
int clen = pg_mblen(*str);
496+
497+
if (clen < avail)
498+
{
499+
COPYCHAR(next, *str);
500+
next += clen;
501+
avail -= clen;
502+
}
503+
state = PAE_INMASK;
504+
}
505+
}
506+
else /* state == PAE_INMASK */
507+
{
508+
if (t_isspace(*str))
509+
{
510+
*next = '\0';
511+
return true;
512+
}
513+
else
514+
{
515+
int clen = pg_mblen(*str);
516+
517+
if (clen < avail)
518+
{
519+
COPYCHAR(next, *str);
520+
next += clen;
521+
avail -= clen;
522+
}
523+
}
524+
}
525+
*str += pg_mblen(*str);
526+
}
527+
528+
*next = '\0';
529+
530+
return (state == PAE_INMASK); /* OK if we got a nonempty field */
531+
}
532+
533+
/*
534+
* Parses entry of an .affix file of MySpell or Hunspell format.
535+
*
536+
* An .affix file entry has the following format:
537+
* - header
538+
* <type> <flag> <cross_flag> <flag_count>
539+
* - fields after header:
540+
* <type> <flag> <find> <replace> <mask>
541+
*
542+
* str is the input line
543+
* field values are returned to type etc, which must be buffers of size BUFSIZ.
544+
*
545+
* Returns number of fields found; any omitted fields are set to empty strings.
546+
*/
547+
static int
548+
parse_ooaffentry(char *str, char *type, char *flag, char *find,
549+
char *repl, char *mask)
550+
{
551+
int state = PAE_WAIT_TYPE;
552+
int fields_read = 0;
553+
bool valid = false;
554+
555+
*type = *flag = *find = *repl = *mask = '\0';
556+
557+
while (*str)
558+
{
559+
switch (state)
560+
{
561+
case PAE_WAIT_TYPE:
562+
valid = get_nextfield(&str, type);
563+
state = PAE_WAIT_FLAG;
564+
break;
565+
case PAE_WAIT_FLAG:
566+
valid = get_nextfield(&str, flag);
567+
state = PAE_WAIT_FIND;
568+
break;
569+
case PAE_WAIT_FIND:
570+
valid = get_nextfield(&str, find);
571+
state = PAE_WAIT_REPL;
572+
break;
573+
case PAE_WAIT_REPL:
574+
valid = get_nextfield(&str, repl);
575+
state = PAE_WAIT_MASK;
576+
break;
577+
case PAE_WAIT_MASK:
578+
valid = get_nextfield(&str, mask);
579+
state = -1; /* force loop exit */
580+
break;
581+
default:
582+
elog(ERROR, "unrecognized state in parse_ooaffentry: %d",
583+
state);
584+
break;
585+
}
586+
if (valid)
587+
fields_read++;
588+
else
589+
break; /* early EOL */
590+
if (state < 0)
591+
break; /* got all fields */
592+
}
593+
594+
return fields_read;
595+
}
596+
597+
/*
598+
* Parses entry of an .affix file of Ispell format
599+
*
600+
* An .affix file entry has the following format:
601+
* <mask> > [-<find>,]<replace>
602+
*/
467603
static bool
468604
parse_affentry(char *str, char *mask, char *find, char *repl)
469605
{
@@ -618,8 +754,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
618754
int flag = 0;
619755
char flagflags = 0;
620756
tsearch_readline_state trst;
621-
int scanread = 0;
622-
char scanbuf[BUFSIZ];
623757
char *recoded;
624758

625759
/* read file to find any flag */
@@ -682,8 +816,6 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
682816
}
683817
tsearch_readline_end(&trst);
684818

685-
sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
686-
687819
if (!tsearch_readline_begin(&trst, filename))
688820
ereport(ERROR,
689821
(errcode(ERRCODE_CONFIG_FILE_ERROR),
@@ -692,18 +824,21 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
692824

693825
while ((recoded = tsearch_readline(&trst)) != NULL)
694826
{
827+
int fields_read;
828+
695829
if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
696830
goto nextline;
697831

698-
scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
832+
fields_read = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
699833

700834
if (ptype)
701835
pfree(ptype);
702836
ptype = lowerstr_ctx(Conf, type);
703-
if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
837+
if (fields_read < 4 ||
838+
(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
704839
goto nextline;
705840

706-
if (scanread == 4)
841+
if (fields_read == 4)
707842
{
708843
if (strlen(sflag) != 1)
709844
goto nextline;
@@ -722,9 +857,13 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
722857
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
723858
goto nextline;
724859
prepl = lowerstr_ctx(Conf, repl);
725-
/* affix flag */
860+
/* Find position of '/' in lowercased string "prepl" */
726861
if ((ptr = strchr(prepl, '/')) != NULL)
727862
{
863+
/*
864+
* Here we use non-lowercased string "repl". We need position
865+
* of '/' in "repl".
866+
*/
728867
*ptr = '\0';
729868
ptr = repl + (ptr - prepl) + 1;
730869
while (*ptr)
@@ -800,11 +939,12 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
800939

801940
if (STRNCMP(pstr, "compoundwords") == 0)
802941
{
942+
/* Find position in lowercased string "pstr" */
803943
s = findchar(pstr, 'l');
804944
if (s)
805945
{
806-
s = recoded + (s - pstr); /* we need non-lowercased
807-
* string */
946+
/* Here we use non-lowercased string "recoded" */
947+
s = recoded + (s - pstr);
808948
while (*s && !t_isspace(s))
809949
s += pg_mblen(s);
810950
while (*s && t_isspace(s))

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.