/* $NetBSD: casefold.c,v 1.3 2020/03/18 19:05:21 christos Exp $ */
/*++
/* NAME
/* casefold 3
/* SUMMARY
/* casefold text for caseless comparison
/* SYNOPSIS
/* #include <stringops.h>
/*
/* char *casefold(
/* VSTRING *dst,
/* const char *src)
/*
/* char *casefold_append(
/* VSTRING *dst,
/* const char *src)
/*
/* char *casefold_len(
/* VSTRING *dst,
/* const char *src,
/* ssize_t src_len)
/* AUXILIARY FUNCTIONS
/* char *casefoldx(
/* int flags,
/* VSTRING *dst,
/* const char *src,
/* ssize_t src_len)
/* DESCRIPTION
/* casefold() converts text to a form that is suitable for
/* caseless comparison, rather than presentation to humans.
/*
/* When compiled without EAI support or util_utf8_enable is
/* zero, casefold() implements ASCII case folding, leaving
/* non-ASCII byte values unchanged.
/*
/* When compiled with EAI support and util_utf8_enable is
/* non-zero, casefold() implements UTF-8 case folding using
/* the en_US locale, as recommended when the conversion result
/* is not meant to be presented to humans.
/*
/* casefold_len() implements casefold() with a source length
/* argument.
/*
/* casefold_append() implements casefold(), but appends the
/* result to the output buffer instead of overwriting it.
/*
/* casefoldx() implements a more complex API that implements
/* all of the above and more.
/*
/* Arguments:
/* .IP src
/* Null-terminated input string.
/* .IP dst
/* Output buffer, null-terminated. Specify a null pointer to
/* use an internal buffer that is overwritten upon each call.
/* .IP src_len
/* The string length, -1 to determine the length dynamically.
/* .IP flags
/* Bitwise OR of zero or more of the following:
/* .RS
/* .IP CASEF_FLAG_UTF8
/* Enable UTF-8 support. This flag has no effect when compiled
/* without EAI support.
/* .IP CASEF_FLAG_APPEND
/* Append the result to the buffer, instead of overwriting it.
/* DIAGNOSTICS
/* All errors are fatal. There appear to be no input-dependent
/* errors.
/*
/* With the ICU 4.8 library, there is no casefold error for
/* UTF-8 code points U+0000..U+10FFFF (including surrogate
/* range), not even when running inside an empty chroot jail.
/* Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes
/* are copied verbatim. Based on ICU 4.8 source-code review
/* and experimentation(!) we conclude that UTF-8 casefolding
/* has no data-dependent error cases, and that it is safe to
/* treat all casefolding errors as fatal runtime errors.
/* LICENSE
/* .ad
/* .fi
/* The Secure Mailer license must be distributed with this software.
/* AUTHOR(S)
/* Wietse Venema
/* IBM T.J. Watson Research
/* P.O. Box 704
/* Yorktown Heights, NY 10598, USA
/*
/* Wietse Venema
/* Google, Inc.
/* 111 8th Avenue
/* New York, NY 10011, USA
/*--*/
/* System library. */
#include <sys_defs.h>
#include <string.h>
#include <ctype.h>
#ifndef NO_EAI
#include <unicode/ucasemap.h>
#include <unicode/ustring.h>
#include <unicode/uchar.h>
#endif
/* Utility library. */
#include <msg.h>
#include <stringops.h>
#define STR(x) vstring_str(x)
#define LEN(x) VSTRING_LEN(x)
/* Upper bound on fold attempts; one resize should always be enough. */
#define CASEFOLD_MAX_TRIES	3

/*
 * casefoldx - casefold a UTF-8 (or ASCII) string
 *
 * flags: bitwise OR of CASEF_FLAG_UTF8 (use ICU case folding for non-ASCII
 * input; ignored when built with NO_EAI) and CASEF_FLAG_APPEND (append to
 * dest instead of overwriting it).
 *
 * dest: output buffer, or a null pointer to use a function-private static
 * buffer that is reused (and, without CASEF_FLAG_APPEND, overwritten) by
 * later calls.
 *
 * src: null-terminated input. len: number of input bytes to fold, or a
 * negative value to use strlen(src).
 *
 * Returns a pointer to the start of the null-terminated content of the
 * output buffer. Any ICU error, and failure to converge on a buffer size,
 * is reported with msg_fatal().
 */
char *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
{
    static VSTRING *fold_buf = 0;
    size_t  old_len;

#ifdef NO_EAI

    /*
     * ASCII mode only. Honor the same "null dest means internal buffer"
     * contract as the Unicode build, so that callers behave the same
     * regardless of how the library was compiled.
     */
    if (len < 0)
        len = strlen(src);
    if (dest == 0)
        dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
    if ((flags & CASEF_FLAG_APPEND) == 0)
        VSTRING_RESET(dest);
    old_len = VSTRING_LEN(dest);
    vstring_strncat(dest, src, len);
    /* Fold only the text that was just added, not pre-existing content. */
    lowercase(STR(dest) + old_len);
    return (STR(dest));
#else

    /*
     * Unicode mode.
     */
    const char myname[] = "casefold";
    static UCaseMap *csm = 0;
    UErrorCode error;
    ssize_t space_needed;
    int     n;

    /*
     * Handle special cases.
     */
    if (len < 0)
        len = strlen(src);
    if (dest == 0)
        dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
    if ((flags & CASEF_FLAG_APPEND) == 0)
        VSTRING_RESET(dest);
    old_len = VSTRING_LEN(dest);

    /*
     * All-ASCII input, or ASCII mode only.
     */
    if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
        vstring_strncat(dest, src, len);
        lowercase(STR(dest) + old_len);
        return (STR(dest));
    }

    /*
     * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
     * errors. XXX Based on source-code review we conclude that non-UTF-8
     * bytes are copied verbatim, and experiments confirm this. Given that
     * this behavior is intentional, we assume that it will stay that way.
     * For this reason the input is not validated here.
     */

    /*
     * One-time initialization. With ICU 4.8 this works while chrooted.
     */
    if (csm == 0) {
        error = U_ZERO_ERROR;
        csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
        if (U_SUCCESS(error) == 0)
            msg_fatal("ucasemap_open error: %s", u_errorName(error));
    }

    /*
     * Fold the input, adjusting the buffer size if needed. Safety: don't
     * loop forever.
     *
     * Note: the requested amount of space for casemapped output (as reported
     * with space_needed below) does not include storage for the null
     * terminator. The terminator is written only when the output buffer is
     * large enough. This is why we overallocate space when the output does
     * not fit. But if the output fits exactly, then the output will be
     * unterminated, and we have to terminate the output ourselves.
     */
    for (n = 0; n < CASEFOLD_MAX_TRIES; n++) {
        error = U_ZERO_ERROR;
        /* STR(dest) is re-read each time; a resize may move the buffer. */
        space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
                                             vstring_avail(dest), src, len,
                                             &error);
        if (U_SUCCESS(error)) {
            vstring_set_payload_size(dest, old_len + space_needed);
            if (vstring_avail(dest) == 0)	/* exact fit, no terminator */
                VSTRING_TERMINATE(dest);	/* add terminator */
            break;
        } else if (error == U_BUFFER_OVERFLOW_ERROR) {
            VSTRING_SPACE(dest, space_needed + 1);	/* for terminator */
        } else {
            msg_fatal("%s: conversion error for \"%s\": %s",
                      myname, src, u_errorName(error));
        }
    }

    /*
     * The loop ends without "break" only when every attempt reported a
     * buffer overflow. In that case the buffer holds incomplete, possibly
     * unterminated output, so do not hand it back to the caller.
     */
    if (n >= CASEFOLD_MAX_TRIES)
        msg_fatal("%s: conversion for \"%s\" did not complete after %d attempts",
                  myname, src, n);
    return (STR(dest));
#endif						/* NO_EAI */
}
#ifdef TEST
/*
 * encode_utf8 - replace the buffer content with the UTF-8 form of codepoint
 *
 * Emits one lead byte followed by zero to three continuation bytes that each
 * carry six payload bits, then null-terminates the buffer. Code points above
 * U+10FFFF are reported with msg_panic(). No surrogate-range check is made
 * here.
 */
static void encode_utf8(VSTRING *buffer, int codepoint)
{
    const char myname[] = "encode_utf8";
    int     lead_byte = 0;
    int     shift = 0;			/* payload bits after the lead byte */
    int     ch;

    VSTRING_RESET(buffer);

    /*
     * Select the lead byte and the number of payload bits that remain to be
     * emitted as continuation bytes.
     */
    if (codepoint < 0x80) {
        lead_byte = codepoint;
    } else if (codepoint < 0x800) {
        lead_byte = 0xc0 | (codepoint >> 6);
        shift = 6;
    } else if (codepoint < 0x10000) {
        lead_byte = 0xe0 | (codepoint >> 12);
        shift = 12;
    } else if (codepoint <= 0x10FFFF) {
        lead_byte = 0xf0 | (codepoint >> 18);
        shift = 18;
    } else {
        /* NOTE(review): msg_panic() is expected not to return. */
        msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint);
    }
    VSTRING_ADDCH(buffer, lead_byte);

    /*
     * Continuation bytes, most significant six-bit group first.
     */
    while (shift > 0) {
        shift -= 6;
        ch = 0x80 | ((codepoint >> shift) & 0x3f);
        VSTRING_ADDCH(buffer, ch);
    }
    VSTRING_TERMINATE(buffer);
}
#include <stdlib.h>
#include <stdio.h>
#include <locale.h>
#include <vstream.h>
#include <vstring_vstream.h>
#include <msg_vstream.h>
/*
 * main - test driver that reads one command per line from standard input
 *
 * Each input line is echoed with a "> " prefix and then interpreted as one
 * of: "fold <text>", "range <first> <last>", "chroot <path>", "file <path>"
 * or "verbose <int>". Empty lines and lines whose first word starts with '#'
 * are skipped; anything else prints a usage message. Output goes to the
 * standard output stream, which is flushed after every command.
 */
int main(int argc, char **argv)
{
    VSTRING *buffer = vstring_alloc(1);
    VSTRING *dest = vstring_alloc(1);
    char    path[256];			/* chroot/file pathname */
    char   *bp;
    char   *conv_res;
    char   *cmd;
    int     codepoint, first, last;
    VSTREAM *fp;

    if (setlocale(LC_ALL, "C") == 0)
        msg_fatal("setlocale(LC_ALL, C) failed: %m");
    msg_vstream_init(argv[0], VSTREAM_ERR);
    util_utf8_enable = 1;

    while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
        bp = STR(buffer);
        vstream_printf("> %s\n", bp);
        cmd = mystrtok(&bp, CHARS_SPACE);
        if (cmd == 0 || *cmd == '#')
            continue;
        /* Position bp at the start of the command argument(s). */
        while (ISSPACE(*bp))
            bp++;

        /*
         * Null-terminated string.
         */
        if (strcmp(cmd, "fold") == 0) {
            conv_res = casefold(dest, bp);
            vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res);
        }

        /*
         * Codepoint range. The input line buffer is reused for the encoded
         * code point; bp is no longer needed once the bounds are parsed.
         */
        else if (strcmp(cmd, "range") == 0
                 && sscanf(bp, "%i %i", &first, &last) == 2
                 && first <= last) {
            for (codepoint = first; codepoint <= last; codepoint++) {
                if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
                    vstream_printf("skipping surrogate range\n");
                    /* Resume after the surrogates on the next iteration. */
                    codepoint = 0xDFFF;
                } else {
                    encode_utf8(buffer, codepoint);
                    if (msg_verbose)
                        vstream_printf("U+%X -> %s\n", codepoint, STR(buffer));
                    if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0)
                        /* No trailing newline: the msg layer ends the line. */
                        msg_fatal("bad utf-8 encoding for U+%X", codepoint);
                    casefold(dest, STR(buffer));
                }
            }
            vstream_printf("range completed: 0x%x..0x%x\n", first, last);
        }

        /*
         * Chroot directory. The pathname is scanned into a separate array
         * because bp points into the line buffer, and sscanf() must not copy
         * between overlapping objects. Nothing is done unless the effective
         * user ID is 0.
         */
        else if (strcmp(cmd, "chroot") == 0
                 && sscanf(bp, "%255s", path) == 1) {
            if (geteuid() == 0) {
                if (chdir(path) < 0)
                    msg_fatal("chdir(%s): %m", path);
                if (chroot(path) < 0)
                    msg_fatal("chroot(%s): %m", path);
                vstream_printf("chroot %s completed\n", path);
            }
        }

        /*
         * File. Each line of the named file is folded and printed; the
         * command line buffer is reused as the file line buffer.
         */
        else if (strcmp(cmd, "file") == 0
                 && sscanf(bp, "%255s", path) == 1) {
            if ((fp = vstream_fopen(path, O_RDONLY, 0)) == 0)
                msg_fatal("open(%s): %m", path);
            while (vstring_fgets_nonl(buffer, fp))
                vstream_printf("%s\n", casefold(dest, STR(buffer)));
            vstream_fclose(fp);
        }

        /*
         * Verbose.
         */
        else if (strcmp(cmd, "verbose") == 0
                 && sscanf(bp, "%i", &msg_verbose) == 1) {
             /* void */ ;
        }

        /*
         * Usage
         */
        else {
            vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n",
                           argv[0]);
        }
        vstream_fflush(VSTREAM_OUT);
    }
    vstring_free(buffer);
    vstring_free(dest);
    exit(0);
}
#endif /* TEST */