Import Tcl 8.6.11

2021-03-30 00:51:39 +01:00
parent 3bb8e3e086
commit 1aadb2455c
923 changed files with 79104 additions and 62616 deletions
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -66,12 +66,31 @@ static const unsigned char totalBytes[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
 #if TCL_UTF_MAX > 3
    4,4,4,4,4,
 #else
-    3,3,3,3,3, /* Tcl_UtfCharComplete() only checks TCL_UTF_MAX bytes */
+    1,1,1,1,1,
+#endif
+    1,1,1,1,1,1,1,1,1,1,1
+};
+
+static const unsigned char complete[256] = {
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+/* Tcl_UtfCharComplete() might point to 2nd byte of valid 4-byte sequence */
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+/* End of "continuation byte section" */
+    2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+#if TCL_UTF_MAX > 3
+    4,4,4,4,4,
+#else
+    3,3,3,3,3,
 #endif
    1,1,1,1,1,1,1,1,1,1,1
 };
@@ -81,6 +100,9 @@ static const unsigned char totalBytes[256] = {
 */

 static int		UtfCount(int ch);
+static int		Invalid(const char *src);
+static int		UCS4ToUpper(int ch);
+static int		UCS4ToTitle(int ch);

 /*
 *---------------------------------------------------------------------------
@@ -115,7 +137,68 @@ UtfCount(
 #endif
    return 3;
 }
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * Invalid --
+ *
+ *	Given a pointer to a two-byte prefix of a well-formed UTF-8 byte
+ *	sequence (a lead byte followed by a trail byte) this routine
+ *	examines those two bytes to determine whether the sequence is
+ *	invalid in UTF-8.  This might be because it is an overlong
+ *	encoding, or because it encodes something out of the proper range.
+ *
+ *	Given a pointer to the bytes \xF8 or \xFC, this routine will
+ *	try to read beyond the end of the "bounds" table.  Callers must
+ *	prevent this.
+ *
+ *	Given a pointer to something else (an ASCII byte, a trail byte,
+ *	or another byte	that can never begin a valid byte sequence such
+ *	as \xF5) this routine returns false.  That makes the routine poorly
+ *	named, as it does not detect and report all invalid sequences.
+ *
+ *	Callers have to take care that this routine does something useful
+ *	for their needs.
+ *
+ * Results:
+ *	A boolean.
+ *---------------------------------------------------------------------------
+ */

+static const unsigned char bounds[28] = {
+    0x80, 0x80,		/* \xC0 accepts \x80 only */
+    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF,
+    0x80, 0xBF,		/* (\xC4 - \xDC) -- all sequences valid */
+    0xA0, 0xBF,	/* \xE0\x80 through \xE0\x9F are invalid prefixes */
+    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */
+#if TCL_UTF_MAX > 3
+    0x90, 0xBF,	/* \xF0\x80 through \xF0\x8F are invalid prefixes */
+    0x80, 0x8F  /* \xF4\x90 and higher are invalid prefixes */
+#else
+    0xC0, 0xBF,	/* Not used, but reject all again for safety. */
+    0xC0, 0xBF	/* Not used, but reject all again for safety. */
+#endif
+};
+
+static int
+Invalid(
+    const char *src)	/* Points to lead byte of a UTF-8 byte sequence */
+{
+    unsigned char byte = UCHAR(*src);
+    int index;
+
+    if ((byte & 0xC3) == 0xC0) {
+	/* Only lead bytes 0xC0, 0xE0, 0xF0, 0xF4 need examination */
+	index = (byte - 0xC0) >> 1;
+	if (UCHAR(src[1]) < bounds[index] || UCHAR(src[1]) > bounds[index+1]) {
+	    /* Out of bounds - report invalid. */
+	    return 1;
+	}
+    }
+    return 0;
+}
+
 /*
 *---------------------------------------------------------------------------
 *
@@ -278,8 +361,8 @@ Tcl_UniCharToUtfDString(
 *	If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done:
 *	For any UTF-8 string containing a character outside of the BMP, the
 *	first call to this function will fill *chPtr with the high surrogate
- *	and generate a return value of 0. Calling Tcl_UtfToUniChar again
- *	will produce the low surrogate and a return value of 4. Because *chPtr
+ *	and generate a return value of 1. Calling Tcl_UtfToUniChar again
+ *	will produce the low surrogate and a return value of 3. Because *chPtr
 *	is used to remember whether the high surrogate is already produced, it
 *	is recommended to initialize the variable it points to as 0 before
 *	the first call to Tcl_UtfToUniChar is done.
@@ -296,8 +379,8 @@ Tcl_UniCharToUtfDString(

 int
 Tcl_UtfToUniChar(
-    register const char *src,	/* The UTF-8 string. */
-    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
+    const char *src,	/* The UTF-8 string. */
+    Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
 				 * the UTF-8 string. */
 {
    Tcl_UniChar byte;
@@ -306,7 +389,7 @@ Tcl_UtfToUniChar(
     * Unroll 1 to 3 (or 4) byte UTF-8 sequences.
     */

-    byte = *((unsigned char *) src);
+    byte = UCHAR(*src);
    if (byte < 0xC0) {
 	/*
 	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
@@ -320,10 +403,10 @@ Tcl_UtfToUniChar(
 	 * bytes, then we must produce a follow-up low surrogate. We only
 	 * do that if the high surrogate matches the bits we encounter.
 	 */
-	if ((byte >= 0x80)
+	if (((byte & 0xC0) == 0x80)
+		&& ((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)
 		&& (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
-		&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
-		&& ((src[2] & 0xC0) == 0x80)) {
+		&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))) {
 	    *chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
 	    return 3;
 	}
@@ -364,26 +447,28 @@ Tcl_UtfToUniChar(
 	 * represents itself.
 	 */
    }
-    else if (byte < 0xF8) {
-	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
+    else if (byte < 0xF5) {
+	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
 	    /*
-	     * Four-byte-character lead byte followed by three trail bytes.
+	     * Four-byte-character lead byte followed by at least two trail bytes.
+	     * We don't test the validity of the 3rd trail byte, see [ed29806ba]
 	     */
 #if TCL_UTF_MAX <= 4
 	    Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
 		    | ((src[2] & 0x3F) >> 4)) - 0x40;
-	    if (high >= 0x400) {
-		/* out of range, < 0x10000 or > 0x10ffff */
-	    } else {
+	    if (high < 0x400) {
 		/* produce high surrogate, advance source pointer */
 		*chPtr = 0xD800 + high;
 		return 1;
 	    }
+	    /* out of range, < 0x10000 or > 0x10FFFF */
 #else
-	    *chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
-		    | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
-	    if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
-		return 4;
+	    if ((src[3] & 0xC0) == 0x80) {
+		*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
+			| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+		if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
+		    return 4;
+		}
 	    }
 #endif
 	}
@@ -426,8 +511,12 @@ Tcl_UtfToUniCharDString(
 				 * DString. */
 {
    Tcl_UniChar ch = 0, *w, *wString;
-    const char *p, *end;
+    const char *p;
    int oldLength;
+    /* Pointer to the end of string. Never read endPtr[0] */
+    const char *endPtr = src + length;
+    /* Pointer to last byte where optimization still can be used */
+    const char *optPtr = endPtr - TCL_UTF_MAX;

    if (length < 0) {
 	length = strlen(src);
@@ -441,28 +530,28 @@ Tcl_UtfToUniCharDString(
    oldLength = Tcl_DStringLength(dsPtr);

    Tcl_DStringSetLength(dsPtr,
-	    oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar)));
+	    oldLength + ((length + 1) * sizeof(Tcl_UniChar)));
    wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);

    w = wString;
    p = src;
-    end = src + length - TCL_UTF_MAX;
-    while (p < end) {
+    endPtr = src + length;
+    optPtr = endPtr - TCL_UTF_MAX;
+    while (p <= optPtr) {
 	p += TclUtfToUniChar(p, &ch);
 	*w++ = ch;
    }
-    end += TCL_UTF_MAX;
-    while (p < end) {
-	if (Tcl_UtfCharComplete(p, end-p)) {
+    while (p < endPtr) {
+	if (Tcl_UtfCharComplete(p, endPtr-p)) {
 	    p += TclUtfToUniChar(p, &ch);
+	    *w++ = ch;
 	} else {
-	    ch = UCHAR(*p++);
+	    *w++ = UCHAR(*p++);
 	}
-	*w++ = ch;
    }
    *w = '\0';
    Tcl_DStringSetLength(dsPtr,
-	    (oldLength + ((char *) w - (char *) wString)));
+	    oldLength + ((char *) w - (char *) wString));

    return wString;
 }
@@ -492,7 +581,7 @@ Tcl_UtfCharComplete(
 				 * a complete UTF-8 character. */
    int length)			/* Length of above string in bytes. */
 {
-    return length >= totalBytes[(unsigned char)*src];
+    return length >= complete[UCHAR(*src)];
 }

 /*
@@ -515,41 +604,52 @@ Tcl_UtfCharComplete(

 int
 Tcl_NumUtfChars(
-    register const char *src,	/* The UTF-8 string to measure. */
-    int length)			/* The length of the string in bytes, or -1
-				 * for strlen(string). */
+    const char *src,	/* The UTF-8 string to measure. */
+    int length)		/* The length of the string in bytes, or -1
+			 * for strlen(string). */
 {
    Tcl_UniChar ch = 0;
-    register int i = 0;
-
-    /*
-     * The separate implementations are faster.
-     *
-     * Since this is a time-sensitive function, we also do the check for the
-     * single-byte char case specially.
-     */
+    int i = 0;

    if (length < 0) {
-	while (*src != '\0') {
+	/* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
+	while ((*src != '\0') && (i < INT_MAX)) {
 	    src += TclUtfToUniChar(src, &ch);
 	    i++;
 	}
-	if (i < 0) i = INT_MAX; /* Bug [2738427] */
    } else {
-	register const char *endPtr = src + length - TCL_UTF_MAX;
+	/* Will return value between 0 and length. No overflow checks. */

+	/* Pointer to the end of string. Never read endPtr[0] */
+	const char *endPtr = src + length;
+	/* Pointer to last byte where optimization still can be used */
+	const char *optPtr = endPtr - TCL_UTF_MAX;
+
+	/*
+	 * Optimize away the call in this loop. Justified because...
+	 * when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr)
+	 * By initialization above (endPtr - optPtr) = TCL_UTF_MAX
+	 * So (endPtr - src) >= TCL_UTF_MAX, and passing that to
+	 * Tcl_UtfCharComplete we know will cause return of 1.
+	 */
+	while (src <= optPtr
+		/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
+	    src += TclUtfToUniChar(src, &ch);
+	    i++;
+	}
+	/* Loop over the remaining string where call must happen */
 	while (src < endPtr) {
-	    src += TclUtfToUniChar(src, &ch);
+	    if (Tcl_UtfCharComplete(src, endPtr - src)) {
+		src += TclUtfToUniChar(src, &ch);
+	    } else {
+		/*
+		 * src points to incomplete UTF-8 sequence
+		 * Treat first byte as character and count it
+		 */
+		src++;
+	    }
 	    i++;
 	}
-	endPtr += TCL_UTF_MAX;
-	while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
-	    src += TclUtfToUniChar(src, &ch);
-	    i++;
-	}
-	if (src < endPtr) {
-	    i += endPtr - src;
-	}
    }
    return i;
 }
@@ -559,7 +659,7 @@ Tcl_NumUtfChars(
 *
 * Tcl_UtfFindFirst --
 *
- *	Returns a pointer to the first occurance of the given Unicode character
+ *	Returns a pointer to the first occurrence of the given Unicode character
 *	in the NULL-terminated UTF-8 string. The NULL terminator is considered
 *	part of the UTF-8 string. Equivalent to Plan 9 utfrune().
 *
@@ -578,19 +678,10 @@ Tcl_UtfFindFirst(
    const char *src,		/* The UTF-8 string to be searched. */
    int ch)			/* The Unicode character to search for. */
 {
-    int len, fullchar;
-    Tcl_UniChar find = 0;
-
    while (1) {
-	len = TclUtfToUniChar(src, &find);
-	fullchar = find;
-#if TCL_UTF_MAX <= 4
-	if ((fullchar != ch) && (find >= 0xD800) && (len < 3)) {
-	    len += TclUtfToUniChar(src + len, &find);
-	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
-	}
-#endif
-	if (fullchar == ch) {
+	int find, len = TclUtfToUCS4(src, &find);
+
+	if (find == ch) {
 	    return src;
 	}
 	if (*src == '\0') {
@@ -605,7 +696,7 @@ Tcl_UtfFindFirst(
 *
 * Tcl_UtfFindLast --
 *
- *	Returns a pointer to the last occurance of the given Unicode character
+ *	Returns a pointer to the last occurrence of the given Unicode character
 *	in the NULL-terminated UTF-8 string. The NULL terminator is considered
 *	part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
 *
@@ -624,21 +715,12 @@ Tcl_UtfFindLast(
    const char *src,		/* The UTF-8 string to be searched. */
    int ch)			/* The Unicode character to search for. */
 {
-    int len, fullchar;
-    Tcl_UniChar find = 0;
-    const char *last;
+    const char *last = NULL;

-    last = NULL;
    while (1) {
-	len = TclUtfToUniChar(src, &find);
-	fullchar = find;
-#if TCL_UTF_MAX <= 4
-	if ((fullchar != ch) && (find >= 0xD800) && (len < 3)) {
-	    len += TclUtfToUniChar(src + len, &find);
-	    fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
-	}
-#endif
-	if (fullchar == ch) {
+	int find, len = TclUtfToUCS4(src, &find);
+
+	if (find == ch) {
 	    last = src;
 	}
 	if (*src == '\0') {
@@ -654,9 +736,11 @@ Tcl_UtfFindLast(
 *
 * Tcl_UtfNext --
 *
- *	Given a pointer to some current location in a UTF-8 string, move
- *	forward one character. The caller must ensure that they are not asking
- *	for the next character after the last character in the string.
+ *	Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
+ *	returns a pointer to the next UTF-8 character in the string.
+ *	The caller must not ask for the next character after the last
+ *	character in the string if the string is not terminated by a null
+ *	character.
 *
 * Results:
 *	The return value is the pointer to the next character in the UTF-8
@@ -672,15 +756,33 @@ const char *
 Tcl_UtfNext(
    const char *src)		/* The current location in the string. */
 {
-    Tcl_UniChar ch = 0;
-    int len = TclUtfToUniChar(src, &ch);
+    int left;
+    const char *next;

-#if TCL_UTF_MAX <= 4
-    if ((ch >= 0xD800) && (len < 3)) {
-	len += TclUtfToUniChar(src + len, &ch);
+    left = totalBytes[UCHAR(*src)];
+    next = src + 1;
+    while (--left) {
+	if ((*next & 0xC0) != 0x80) {
+	    /*
+	     * next points to a non-trail byte; we ran out of trail bytes
+	     * before the needs of the lead byte were satisfied.
+	     * Let the (malformed) lead byte alone be a character
+	     */
+	    return src + 1;
+	}
+	next++;
    }
-#endif
-    return src + len;
+    /*
+     * Call Invalid() here only if required conditions are met:
+     *    src[0] is known to be a lead byte.
+     *    src[1] is known to be a trail byte.
+     * It is especially important to prevent calls when src[0] is '\xF8' or
+     * '\xFC'; run tests utf-6.37 through utf-6.43 under valgrind or similar.
+     */
+    if ((next == src + 1) || Invalid(src)) {
+	return src + 1;
+    }
+    return next;
 }

 /*
@@ -706,31 +808,95 @@ Tcl_UtfNext(

 const char *
 Tcl_UtfPrev(
-    const char *src,		/* The current location in the string. */
-    const char *start)		/* Pointer to the beginning of the string, to
-				 * avoid going backwards too far. */
+    const char *src,		/* A location in a UTF-8 string. */
+    const char *start)		/* Pointer to the beginning of the string */
 {
-    const char *look;
-    int i, byte;
+    int trailBytesSeen = 0;	/* How many trail bytes have been verified? */
+    const char *fallback = src - 1;
+				/* If we cannot find a lead byte that might
+				 * start a prefix of a valid UTF byte sequence,
+				 * we will fall back to a one-byte back step */
+    const char *look = fallback;
+				/* Start search at the fallback position */
+
+    /* Quick boundary case exit. */
+    if (fallback <= start) {
+	return start;
+    }
+
+    do {
+	unsigned char byte = UCHAR(look[0]);

-    look = --src;
-    for (i = 0; i < 4; i++) {
-	if (look < start) {
-	    if (src < start) {
-		src = start;
-	    }
-	    break;
-	}
-	byte = *((unsigned char *) look);
 	if (byte < 0x80) {
-	    break;
+	    /*
+	     * Single byte character. Either this is a correct previous
+	     * character, or it is followed by at least one trail byte
+	     * which indicates a malformed sequence. In either case the
+	     * correct result is to return the fallback.
+	     */
+	    return fallback;
 	}
 	if (byte >= 0xC0) {
-	    return look;
+	    /* Non-trail byte; May be multibyte lead. */
+
+	    if ((trailBytesSeen == 0)
+		/*
+		 * We've seen no trailing context to use to check
+		 * anything. From what we know, this non-trail byte
+		 * is a prefix of a previous character, and accepting
+		 * it (the fallback) is correct.
+		 */
+
+		    || (trailBytesSeen >= totalBytes[byte])) {
+		/*
+		 * That is, (1 + trailBytesSeen > needed).
+		 * We've examined more bytes than needed to complete
+		 * this lead byte. No matter about well-formedness or
+		 * validity, the sequence starting with this lead byte
+		 * will never include the fallback location, so we must
+		 * return the fallback location. See test utf-7.17
+		 */
+		return fallback;
+	    }
+
+	    /*
+	     * trailBytesSeen > 0, so we can examine look[1] safely.
+	     * Use that capability to screen out invalid sequences.
+	     */
+
+	    if (Invalid(look)) {
+		/* Reject */
+		return fallback;
+	    }
+	    return (const char *)look;
 	}
+
+	/* We saw a trail byte. */
+	trailBytesSeen++;
+
+	if ((const char *)look == start) {
+	    /*
+	     * Do not read before the start of the string
+	     *
+	     * If we get here, we've examined bytes at every location
+	     * >= start and < src and all of them are trail bytes,
+	     * including (*start).  We need to return our fallback
+	     * and exit this loop before we run past the start of the string.
+	     */
+	    return fallback;
+	}
+
+	/* Continue the search backwards... */
 	look--;
-    }
-    return src;
+    } while (trailBytesSeen < 3);
+
+    /*
+     * We've seen 3 trail bytes, so we know there will not be a
+     * properly formed byte sequence to find, and we can stop looking,
+     * accepting the fallback.
+     */
+
+    return fallback;
 }

 /*
@@ -752,8 +918,8 @@ Tcl_UtfPrev(

 Tcl_UniChar
 Tcl_UniCharAtIndex(
-    register const char *src,	/* The UTF-8 string to dereference. */
-    register int index)		/* The position of the desired character. */
+    const char *src,	/* The UTF-8 string to dereference. */
+    int index)		/* The position of the desired character. */
 {
    Tcl_UniChar ch = 0;

@@ -782,8 +948,8 @@ Tcl_UniCharAtIndex(

 const char *
 Tcl_UtfAtIndex(
-    register const char *src,	/* The UTF-8 string. */
-    register int index)		/* The position of the desired character. */
+    const char *src,	/* The UTF-8 string. */
+    int index)		/* The position of the desired character. */
 {
    Tcl_UniChar ch = 0;
    int len = 0;
@@ -846,7 +1012,7 @@ Tcl_UtfBackslash(
 	 * We ate a whole line. Pay the price of a strlen()
 	 */

-	result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
+	result = TclParseBackslash(src, strlen(src), &numRead, dst);
    }
    if (readPtr != NULL) {
 	*readPtr = numRead;
@@ -876,7 +1042,7 @@ int
 Tcl_UtfToUpper(
    char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, upChar;
+    int ch, upChar;
    char *src, *dst;
    int len;

@@ -886,8 +1052,8 @@ Tcl_UtfToUpper(

    src = dst = str;
    while (*src) {
-	len = TclUtfToUniChar(src, &ch);
-	upChar = Tcl_UniCharToUpper(ch);
+	len = TclUtfToUCS4(src, &ch);
+	upChar = UCS4ToUpper(ch);

 	/*
 	 * To keep badly formed Utf strings from getting inflated by the
@@ -899,7 +1065,7 @@ Tcl_UtfToUpper(
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
-	    dst += Tcl_UniCharToUtf(upChar, dst);
+	    dst += TclUCS4ToUtf(upChar, dst);
 	}
 	src += len;
    }
@@ -929,7 +1095,7 @@ int
 Tcl_UtfToLower(
    char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, lowChar;
+    int ch, lowChar;
    char *src, *dst;
    int len;

@@ -939,8 +1105,8 @@ Tcl_UtfToLower(

    src = dst = str;
    while (*src) {
-	len = TclUtfToUniChar(src, &ch);
-	lowChar = Tcl_UniCharToLower(ch);
+	len = TclUtfToUCS4(src, &ch);
+	lowChar = TclUCS4ToLower(ch);

 	/*
 	 * To keep badly formed Utf strings from getting inflated by the
@@ -952,7 +1118,7 @@ Tcl_UtfToLower(
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
-	    dst += Tcl_UniCharToUtf(lowChar, dst);
+	    dst += TclUCS4ToUtf(lowChar, dst);
 	}
 	src += len;
    }
@@ -983,7 +1149,7 @@ int
 Tcl_UtfToTitle(
    char *str)			/* String to convert in place. */
 {
-    Tcl_UniChar ch = 0, titleChar, lowChar;
+    int ch, titleChar, lowChar;
    char *src, *dst;
    int len;

@@ -995,30 +1161,30 @@ Tcl_UtfToTitle(
    src = dst = str;

    if (*src) {
-	len = TclUtfToUniChar(src, &ch);
-	titleChar = Tcl_UniCharToTitle(ch);
+	len = TclUtfToUCS4(src, &ch);
+	titleChar = UCS4ToTitle(ch);

 	if (len < UtfCount(titleChar)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
-	    dst += Tcl_UniCharToUtf(titleChar, dst);
+	    dst += TclUCS4ToUtf(titleChar, dst);
 	}
 	src += len;
    }
    while (*src) {
-	len = TclUtfToUniChar(src, &ch);
+	len = TclUtfToUCS4(src, &ch);
 	lowChar = ch;
 	/* Special exception for Georgian Asomtavruli chars, no titlecase. */
 	if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
-	    lowChar = Tcl_UniCharToLower(lowChar);
+	    lowChar = TclUCS4ToLower(lowChar);
 	}

 	if (len < UtfCount(lowChar)) {
 	    memmove(dst, src, len);
 	    dst += len;
 	} else {
-	    dst += Tcl_UniCharToUtf(lowChar, dst);
+	    dst += TclUCS4ToUtf(lowChar, dst);
 	}
 	src += len;
    }
@@ -1055,7 +1221,7 @@ TclpUtfNcmp2(
     * fine in the strcmp manner.
     */

-    register int result = 0;
+    int result = 0;

    for ( ; numBytes != 0; numBytes--, cs++, ct++) {
 	if (*cs != *ct) {
@@ -1116,11 +1282,11 @@ Tcl_UtfNcmp(
 	if (ch1 != ch2) {
 #if TCL_UTF_MAX == 4
 	    /* Surrogates always report higher than non-surrogates */
-	    if (((ch1 & 0xFC00) == 0xD800)) {
-	    if ((ch2 & 0xFC00) != 0xD800) {
+	    if (((ch1 & ~0x3FF) == 0xD800)) {
+	    if ((ch2 & ~0x3FF) != 0xD800) {
 		return ch1;
 	    }
-	    } else if ((ch2 & 0xFC00) == 0xD800) {
+	    } else if ((ch2 & ~0x3FF) == 0xD800) {
 		return -ch2;
 	    }
 #endif
@@ -1251,16 +1417,26 @@ TclUtfCasecmp(
 *----------------------------------------------------------------------
 */

+static int
+UCS4ToUpper(
+    int ch)			/* Unicode character to convert. */
+{
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
+
+	if (GetCaseType(info) & 0x04) {
+	    ch -= GetDelta(info);
+	}
+    }
+    /* Clear away extension bits, if any */
+    return ch & 0x1FFFFF;
+}
+
 Tcl_UniChar
 Tcl_UniCharToUpper(
    int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
-
-    if (GetCaseType(info) & 0x04) {
-	ch -= GetDelta(info);
-    }
-    return (Tcl_UniChar) ch;
+    return (Tcl_UniChar) UCS4ToUpper(ch);
 }

 /*
@@ -1279,17 +1455,27 @@ Tcl_UniCharToUpper(
 *----------------------------------------------------------------------
 */

+int
+TclUCS4ToLower(
+    int ch)			/* Unicode character to convert. */
+{
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
+	int mode = GetCaseType(info);
+
+	if ((mode & 0x02) && (mode != 0x7)) {
+	    ch += GetDelta(info);
+	}
+    }
+    /* Clear away extension bits, if any */
+    return ch & 0x1FFFFF;
+}
+
 Tcl_UniChar
 Tcl_UniCharToLower(
    int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
-    int mode = GetCaseType(info);
-
-    if ((mode & 0x02) && (mode != 0x7)) {
-	ch += GetDelta(info);
-    }
-    return (Tcl_UniChar) ch;
+    return (Tcl_UniChar) TclUCS4ToLower(ch);
 }

 /*
@@ -1308,25 +1494,35 @@ Tcl_UniCharToLower(
 *----------------------------------------------------------------------
 */

+static int
+UCS4ToTitle(
+    int ch)			/* Unicode character to convert. */
+{
+    if (!UNICODE_OUT_OF_RANGE(ch)) {
+	int info = GetUniCharInfo(ch);
+	int mode = GetCaseType(info);
+
+	if (mode & 0x1) {
+	    /*
+	     * Subtract or add one depending on the original case.
+	     */
+
+	    if (mode != 0x7) {
+		ch += ((mode & 0x4) ? -1 : 1);
+	    }
+	} else if (mode == 0x4) {
+	    ch -= GetDelta(info);
+	}
+    }
+    /* Clear away extension bits, if any */
+    return ch & 0x1FFFFF;
+}
+
 Tcl_UniChar
 Tcl_UniCharToTitle(
    int ch)			/* Unicode character to convert. */
 {
-    int info = GetUniCharInfo(ch);
-    int mode = GetCaseType(info);
-
-    if (mode & 0x1) {
-	/*
-	 * Subtract or add one depending on the original case.
-	 */
-
-	if (mode != 0x7) {
-	    ch += ((mode & 0x4) ? -1 : 1);
-	}
-    } else if (mode == 0x4) {
-	ch -= GetDelta(info);
-    }
-    return (Tcl_UniChar) ch;
+    return (Tcl_UniChar) UCS4ToTitle(ch);
 }

 /*
@@ -1518,8 +1714,9 @@ Tcl_UniCharIsControl(
 {
 #if TCL_UTF_MAX > 3
    if (UNICODE_OUT_OF_RANGE(ch)) {
+	/* Clear away extension bits, if any */
 	ch &= 0x1FFFFF;
-	if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) {
+	if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007F))) {
 	    return 1;
 	}
 	if ((ch >= 0xF0000) && ((ch & 0xFFFF) <= 0xFFFD)) {
@@ -1581,8 +1778,7 @@ Tcl_UniCharIsGraph(
 {
 #if TCL_UTF_MAX > 3
    if (UNICODE_OUT_OF_RANGE(ch)) {
-	ch &= 0x1FFFFF;
-	return (ch >= 0xE0100) && (ch <= 0xE01EF);
+	return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
    }
 #endif
    return ((GRAPH_BITS >> GetCategory(ch)) & 1);
@@ -1638,8 +1834,7 @@ Tcl_UniCharIsPrint(
 {
 #if TCL_UTF_MAX > 3
    if (UNICODE_OUT_OF_RANGE(ch)) {
-	ch &= 0x1FFFFF;
-	return (ch >= 0xE0100) && (ch <= 0xE01EF);
+	return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
    }
 #endif
    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
@@ -1707,7 +1902,7 @@ Tcl_UniCharIsSpace(
     */

    if (ch < 0x80) {
-	return TclIsSpaceProc((char) ch);
+	return TclIsSpaceProcM((char) ch);
 #if TCL_UTF_MAX > 3
    } else if (UNICODE_OUT_OF_RANGE(ch)) {
 	return 0;
@@ -2157,6 +2352,119 @@ TclUniCharMatch(
    }
 }

+/*
+ *---------------------------------------------------------------------------
+ *
+ * TclUtfToUCS4 --
+ *
+ *	Extract the 4-byte codepoint from the leading bytes of the
+ *	Modified UTF-8 string "src".  This is a utility routine to
+ *	contain the surrogate gymnastics in one place.
+ *
+ *	The caller must ensure that the source buffer is long enough that this
+ *	routine does not run off the end and dereference non-existent memory
+ *	looking for trail bytes. If the source buffer is known to be '\0'
+ *	terminated, this cannot happen. Otherwise, the caller should call
+ *	TclUCS4Complete() before calling this routine to ensure that
+ *	enough bytes remain in the string.
+ *
+ * Results:
+ *	*ucs4Ptr is filled with the UCS4 code point, and the return value is
+ *	the number of bytes from the UTF-8 string that were consumed.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUtfToUCS4(
+    const char *src,	/* The UTF-8 string. */
+    int *ucs4Ptr)	/* Filled with the UCS4 codepoint represented
+			 * by the UTF-8 string. */
+{
+    Tcl_UniChar ch = 0;
+    int len = Tcl_UtfToUniChar(src, &ch);
+
+#if TCL_UTF_MAX <= 4
+    if ((ch & ~0x3FF) == 0xD800) {
+	Tcl_UniChar low = ch;
+	int len2 = Tcl_UtfToUniChar(src+len, &low);
+	if ((low & ~0x3FF) == 0xDC00) {
+	    *ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
+	    return len + len2;
+	}
+    }
+#endif
+    *ucs4Ptr = (int)ch;
+    return len;
+}
+
+#if TCL_UTF_MAX == 4
+int
+TclUniCharToUCS4(
+    const Tcl_UniChar *src,	/* The Tcl_UniChar string. */
+    int *ucs4Ptr)	/* Filled with the UCS4 codepoint represented
+			 * by the Tcl_UniChar string. */
+{
+    if (((src[0] & 0xFC00) == 0xD800) && ((src[1] & 0xFC00) == 0xDC00)) {
+	*ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[01] & 0x3FF)) + 0x10000;
+	return 2;
+    }
+    *ucs4Ptr = src[0];
+    return 1;
+}
+#endif
+
+/*
+ *---------------------------------------------------------------------------
+ *
+ * TclUCS4ToUtf --
+ *
+ *	Store the given Unicode character as a sequence of UTF-8 bytes in the
+ *	provided buffer. Might output 6 bytes, if the code point > 0xFFFF.
+ *
+ * Results:
+ *	The return value is the number of bytes in the buffer that were
+ *	consumed. If ch == -1, this function outputs 0 bytes (empty string),
+ *	since TclGetUCS4 returns -1 for out-of-range indices.
+ *
+ * Side effects:
+ *	None.
+ *
+ *---------------------------------------------------------------------------
+ */
+
+int
+TclUCS4ToUtf(
+    int ch,			/* Unicode character to be stored in the
+				 * buffer. */
+    char *buf)			/* Buffer in which the UTF-8 representation of
+				 * the Unicode character is stored. Buffer must be
+				 * large enough to hold the UTF-8 character(s)
+				 * (at most 6 bytes). */
+{
+#if TCL_UTF_MAX <= 4
+    if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
+	/* Spit out a 4-byte UTF-8 character or 2 x 3-byte UTF-8 characters, depending on Tcl
+	 * version and/or TCL_UTF_MAX build value */
+	int len = Tcl_UniCharToUtf(0xD800 | ((ch - 0x10000) >> 10), buf);
+	return len + Tcl_UniCharToUtf(0xDC00 | (ch & 0x7FF), buf + len);
+    }
+#endif
+    if ((ch & ~0x7FF) == 0xD800) {
+	buf[2] = (char) ((ch | 0x80) & 0xBF);
+	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
+	buf[0] = (char) ((ch >> 12) | 0xE0);
+	return 3;
+    }
+    if (ch == -1) {
+	return 0;
+    }
+    return Tcl_UniCharToUtf(ch, buf);
+}
+
 /*
 * Local Variables:
 * mode: c