Import Tcl 8.6.12

2021-11-08 17:30:58 +00:00
parent 1aadb2455c
commit 674867e7e6
608 changed files with 78089 additions and 60360 deletions
--- a/generic/tclUtf.c
+++ b/generic/tclUtf.c
@@ -90,6 +90,8 @@ static const unsigned char complete[256] = {
 #if TCL_UTF_MAX > 3
    4,4,4,4,4,
 #else
+    /* Tcl_UtfToUniChar() accesses src[1] and src[2] to check whether
+     * the UTF-8 sequence is valid, so we cannot use 1 here. */
    3,3,3,3,3,
 #endif
    1,1,1,1,1,1,1,1,1,1,1
@@ -536,7 +538,7 @@ Tcl_UtfToUniCharDString(
    w = wString;
    p = src;
    endPtr = src + length;
-    optPtr = endPtr - TCL_UTF_MAX;
+    optPtr = endPtr - ((TCL_UTF_MAX > 3) ? 4 : 3) ;
    while (p <= optPtr) {
 	p += TclUtfToUniChar(p, &ch);
 	*w++ = ch;
@@ -623,7 +625,7 @@ Tcl_NumUtfChars(
 	/* Pointer to the end of string. Never read endPtr[0] */
 	const char *endPtr = src + length;
 	/* Pointer to last byte where optimization still can be used */
-	const char *optPtr = endPtr - TCL_UTF_MAX;
+	const char *optPtr = endPtr - ((TCL_UTF_MAX > 3) ? 4 : 3);

 	/*
 	 * Optimize away the call in this loop. Justified because...
@@ -759,6 +761,19 @@ Tcl_UtfNext(
    int left;
    const char *next;

+#if TCL_UTF_MAX > 3
+    if (((*src) & 0xC0) == 0x80) {
+	/* Continuation byte, so we start 'inside' a (possible valid) UTF-8
+	 * sequence. Since we are not allowed to access src[-1], we cannot
+	 * check if the sequence is actually valid, the best we can do is
+	 * just assume it is valid and locate the end. */
+	if ((((*++src) & 0xC0) == 0x80) && (((*++src) & 0xC0) == 0x80)) {
+	    ++src;
+	}
+	return src;
+    }
+#endif
+
    left = totalBytes[UCHAR(*src)];
    next = src + 1;
    while (--left) {
@@ -888,14 +903,13 @@ Tcl_UtfPrev(

 	/* Continue the search backwards... */
 	look--;
-    } while (trailBytesSeen < 3);
+    } while (trailBytesSeen < (TCL_UTF_MAX < 4 ? 3 : 4));

    /*
     * We've seen 3 trail bytes, so we know there will not be a
     * properly formed byte sequence to find, and we can stop looking,
     * accepting the fallback.
     */
-
    return fallback;
 }

@@ -1282,11 +1296,11 @@ Tcl_UtfNcmp(
 	if (ch1 != ch2) {
 #if TCL_UTF_MAX == 4
 	    /* Surrogates always report higher than non-surrogates */
-	    if (((ch1 & ~0x3FF) == 0xD800)) {
-	    if ((ch2 & ~0x3FF) != 0xD800) {
+	    if (((ch1 & 0xFC00) == 0xD800)) {
+	    if ((ch2 & 0xFC00) != 0xD800) {
 		return ch1;
 	    }
-	    } else if ((ch2 & ~0x3FF) == 0xD800) {
+	    } else if ((ch2 & 0xFC00) == 0xD800) {
 		return -ch2;
 	    }
 #endif
@@ -1578,7 +1592,7 @@ Tcl_UniCharNcmp(
    const Tcl_UniChar *uct,	/* Unicode string ucs is compared to. */
    unsigned long numChars)	/* Number of unichars to compare. */
 {
-#ifdef WORDS_BIGENDIAN
+#if defined(WORDS_BIGENDIAN) && (TCL_UTF_MAX != 4)
    /*
     * We are definitely on a big-endian machine; memcmp() is safe
     */
@@ -1592,6 +1606,14 @@ Tcl_UniCharNcmp(

    for ( ; numChars != 0; ucs++, uct++, numChars--) {
 	if (*ucs != *uct) {
+#if TCL_UTF_MAX == 4
+	    /* special case for handling upper surrogates */
+	    if (((*ucs & 0xFC00) == 0xD800) && ((*uct & 0xFC00) != 0xD800)) {
+		return 1;
+	    } else if (((*uct & 0xFC00) == 0xD800)) {
+		return -1;
+	    }
+#endif
 	    return (*ucs - *uct);
 	}
    }
@@ -1629,6 +1651,14 @@ Tcl_UniCharNcasecmp(
 	    Tcl_UniChar lct = Tcl_UniCharToLower(*uct);

 	    if (lcs != lct) {
+#if TCL_UTF_MAX == 4
+	    /* special case for handling upper surrogates */
+	    if (((lcs & 0xFC00) == 0xD800) && ((lct & 0xFC00) != 0xD800)) {
+		return 1;
+	    } else if (((lct & 0xFC00) == 0xD800)) {
+		return -1;
+	    }
+#endif
 		return (lcs - lct);
 	    }
 	}