Import Tcl 8.6.11
This commit is contained in:
642
generic/tclUtf.c
642
generic/tclUtf.c
@@ -66,12 +66,31 @@ static const unsigned char totalBytes[256] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
#if TCL_UTF_MAX > 3
|
||||
4,4,4,4,4,
|
||||
#else
|
||||
3,3,3,3,3, /* Tcl_UtfCharComplete() only checks TCL_UTF_MAX bytes */
|
||||
1,1,1,1,1,
|
||||
#endif
|
||||
1,1,1,1,1,1,1,1,1,1,1
|
||||
};
|
||||
|
||||
/*
 * Lookup table indexed by the first byte at a position in a UTF-8 string.
 * Tcl_UtfCharComplete() compares the number of available bytes against the
 * entry selected by that byte, so each entry is the byte count required
 * before the position is reported as holding a complete character.
 */

static const unsigned char complete[256] = {
    /* 0x00 - 0x7F: one byte is always enough. */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

    /*
     * 0x80 - 0xBF: continuation bytes. Tcl_UtfCharComplete() might point
     * to the 2nd byte of a valid 4-byte sequence, so three bytes are asked
     * for here.
     */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    /* End of the continuation byte section. */

    /* 0xC0 - 0xDF: two-byte lead bytes; the entry for 0xC1 is 1. */
    2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,

    /* 0xE0 - 0xEF: three-byte lead bytes. */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,

    /* 0xF0 - 0xF4: four-byte lead bytes; capped at 3 when TCL_UTF_MAX <= 3. */
#if TCL_UTF_MAX > 3
    4,4,4,4,4,
#else
    3,3,3,3,3,
#endif

    /* 0xF5 - 0xFF: one byte is enough. */
    1,1,1,1,1,1,1,1,1,1,1
};
|
||||
@@ -81,6 +100,9 @@ static const unsigned char totalBytes[256] = {
|
||||
*/
|
||||
|
||||
static int UtfCount(int ch);
|
||||
static int Invalid(const char *src);
|
||||
static int UCS4ToUpper(int ch);
|
||||
static int UCS4ToTitle(int ch);
|
||||
|
||||
/*
|
||||
*---------------------------------------------------------------------------
|
||||
@@ -115,7 +137,68 @@ UtfCount(
|
||||
#endif
|
||||
return 3;
|
||||
}
|
||||
|
||||
/*
 *---------------------------------------------------------------------------
 *
 * Invalid --
 *
 *	Given a pointer to a two-byte prefix of a well-formed UTF-8 byte
 *	sequence (a lead byte followed by a trail byte) this routine
 *	examines those two bytes to determine whether the sequence is
 *	invalid in UTF-8. This might be because it is an overlong
 *	encoding, or because it encodes something out of the proper range.
 *
 *	Given a pointer to the bytes \xF8 or \xFC, which can never begin a
 *	valid byte sequence, this routine reports the sequence as invalid
 *	whatever the second byte is. The "bounds" table carries explicit
 *	reject-all entries for those two bytes, so no read beyond the end
 *	of the table can occur.
 *
 *	Given a pointer to something else (an ASCII byte, a trail byte,
 *	or another byte that can never begin a valid byte sequence such
 *	as \xF5) this routine returns false. That makes the routine poorly
 *	named, as it does not detect and report all invalid sequences.
 *
 *	Callers have to take care that this routine does something useful
 *	for their needs.
 *
 * Results:
 *	A boolean: 1 when the two-byte prefix is recognized as invalid,
 *	0 otherwise.
 *
 *---------------------------------------------------------------------------
 */

/*
 * Pairs of (lowest, highest) accepted second byte, one pair for every
 * fourth lead byte starting at \xC0. A pair whose low bound exceeds its
 * high bound (0xC0, 0xBF) can never be satisfied and so rejects everything.
 */

static const unsigned char bounds[32] = {
    0x80, 0x80,		/* \xC0 accepts \x80 only */
    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF,
    0x80, 0xBF,		/* (\xC4 - \xDC) -- all sequences valid */
    0xA0, 0xBF,		/* \xE0\x80 through \xE0\x9F are invalid prefixes */
    0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, /* (\xE4 - \xEC) -- all valid */
#if TCL_UTF_MAX > 3
    0x90, 0xBF,		/* \xF0\x80 through \xF0\x8F are invalid prefixes */
    0x80, 0x8F,		/* \xF4\x90 and higher are invalid prefixes */
#else
    0xC0, 0xBF,		/* Not used, but reject all again for safety. */
    0xC0, 0xBF,		/* Not used, but reject all again for safety. */
#endif
    0xC0, 0xBF,		/* \xF8 never starts a valid sequence: reject all */
    0xC0, 0xBF		/* \xFC never starts a valid sequence: reject all */
};

static int
Invalid(
    const char *src)	/* Points to lead byte of a UTF-8 byte sequence */
{
    const unsigned char lead = (unsigned char) src[0];

    /*
     * The mask selects every fourth byte from \xC0 up to \xFC. Those are
     * the only lead bytes that own a pair in the bounds table; of them
     * \xC0, \xE0, \xF0, \xF4, \xF8 and \xFC have restrictive bounds.
     * The second byte is read only for these lead bytes.
     */

    if ((lead & 0xC3) == 0xC0) {
	const unsigned char second = (unsigned char) src[1];
	const int index = (lead - 0xC0) >> 1;

	if (second < bounds[index] || second > bounds[index + 1]) {
	    /* Out of bounds - report invalid. */
	    return 1;
	}
    }
    return 0;
}
|
||||
|
||||
/*
|
||||
*---------------------------------------------------------------------------
|
||||
*
|
||||
@@ -278,8 +361,8 @@ Tcl_UniCharToUtfDString(
|
||||
* If TCL_UTF_MAX <= 4, special handling of Surrogate pairs is done:
|
||||
* For any UTF-8 string containing a character outside of the BMP, the
|
||||
* first call to this function will fill *chPtr with the high surrogate
|
||||
* and generate a return value of 0. Calling Tcl_UtfToUniChar again
|
||||
* will produce the low surrogate and a return value of 4. Because *chPtr
|
||||
* and generate a return value of 1. Calling Tcl_UtfToUniChar again
|
||||
* will produce the low surrogate and a return value of 3. Because *chPtr
|
||||
* is used to remember whether the high surrogate is already produced, it
|
||||
* is recommended to initialize the variable it points to as 0 before
|
||||
* the first call to Tcl_UtfToUniChar is done.
|
||||
@@ -296,8 +379,8 @@ Tcl_UniCharToUtfDString(
|
||||
|
||||
int
|
||||
Tcl_UtfToUniChar(
|
||||
register const char *src, /* The UTF-8 string. */
|
||||
register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
|
||||
const char *src, /* The UTF-8 string. */
|
||||
Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
|
||||
* the UTF-8 string. */
|
||||
{
|
||||
Tcl_UniChar byte;
|
||||
@@ -306,7 +389,7 @@ Tcl_UtfToUniChar(
|
||||
* Unroll 1 to 3 (or 4) byte UTF-8 sequences.
|
||||
*/
|
||||
|
||||
byte = *((unsigned char *) src);
|
||||
byte = UCHAR(*src);
|
||||
if (byte < 0xC0) {
|
||||
/*
|
||||
* Handles properly formed UTF-8 characters between 0x01 and 0x7F.
|
||||
@@ -320,10 +403,10 @@ Tcl_UtfToUniChar(
|
||||
* bytes, then we must produce a follow-up low surrogate. We only
|
||||
* do that if the high surrogate matches the bits we encounter.
|
||||
*/
|
||||
if ((byte >= 0x80)
|
||||
if (((byte & 0xC0) == 0x80)
|
||||
&& ((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)
|
||||
&& (((((byte - 0x10) << 2) & 0xFC) | 0xD800) == (*chPtr & 0xFCFC))
|
||||
&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))
|
||||
&& ((src[2] & 0xC0) == 0x80)) {
|
||||
&& ((src[1] & 0xF0) == (((*chPtr << 4) & 0x30) | 0x80))) {
|
||||
*chPtr = ((src[1] & 0x0F) << 6) + (src[2] & 0x3F) + 0xDC00;
|
||||
return 3;
|
||||
}
|
||||
@@ -364,26 +447,28 @@ Tcl_UtfToUniChar(
|
||||
* represents itself.
|
||||
*/
|
||||
}
|
||||
else if (byte < 0xF8) {
|
||||
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) {
|
||||
else if (byte < 0xF5) {
|
||||
if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
|
||||
/*
|
||||
* Four-byte-character lead byte followed by three trail bytes.
|
||||
* Four-byte-character lead byte followed by at least two trail bytes.
|
||||
* We don't test the validity of 3th trail byte, see [ed29806ba]
|
||||
*/
|
||||
#if TCL_UTF_MAX <= 4
|
||||
Tcl_UniChar high = (((byte & 0x07) << 8) | ((src[1] & 0x3F) << 2)
|
||||
| ((src[2] & 0x3F) >> 4)) - 0x40;
|
||||
if (high >= 0x400) {
|
||||
/* out of range, < 0x10000 or > 0x10ffff */
|
||||
} else {
|
||||
if (high < 0x400) {
|
||||
/* produce high surrogate, advance source pointer */
|
||||
*chPtr = 0xD800 + high;
|
||||
return 1;
|
||||
}
|
||||
/* out of range, < 0x10000 or > 0x10FFFF */
|
||||
#else
|
||||
*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
|
||||
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
|
||||
if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
|
||||
return 4;
|
||||
if ((src[3] & 0xC0) == 0x80) {
|
||||
*chPtr = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12)
|
||||
| ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
|
||||
if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) {
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -426,8 +511,12 @@ Tcl_UtfToUniCharDString(
|
||||
* DString. */
|
||||
{
|
||||
Tcl_UniChar ch = 0, *w, *wString;
|
||||
const char *p, *end;
|
||||
const char *p;
|
||||
int oldLength;
|
||||
/* Pointer to the end of string. Never read endPtr[0] */
|
||||
const char *endPtr = src + length;
|
||||
/* Pointer to last byte where optimization still can be used */
|
||||
const char *optPtr = endPtr - TCL_UTF_MAX;
|
||||
|
||||
if (length < 0) {
|
||||
length = strlen(src);
|
||||
@@ -441,28 +530,28 @@ Tcl_UtfToUniCharDString(
|
||||
oldLength = Tcl_DStringLength(dsPtr);
|
||||
|
||||
Tcl_DStringSetLength(dsPtr,
|
||||
oldLength + (int) ((length + 1) * sizeof(Tcl_UniChar)));
|
||||
oldLength + ((length + 1) * sizeof(Tcl_UniChar)));
|
||||
wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
|
||||
|
||||
w = wString;
|
||||
p = src;
|
||||
end = src + length - TCL_UTF_MAX;
|
||||
while (p < end) {
|
||||
endPtr = src + length;
|
||||
optPtr = endPtr - TCL_UTF_MAX;
|
||||
while (p <= optPtr) {
|
||||
p += TclUtfToUniChar(p, &ch);
|
||||
*w++ = ch;
|
||||
}
|
||||
end += TCL_UTF_MAX;
|
||||
while (p < end) {
|
||||
if (Tcl_UtfCharComplete(p, end-p)) {
|
||||
while (p < endPtr) {
|
||||
if (Tcl_UtfCharComplete(p, endPtr-p)) {
|
||||
p += TclUtfToUniChar(p, &ch);
|
||||
*w++ = ch;
|
||||
} else {
|
||||
ch = UCHAR(*p++);
|
||||
*w++ = UCHAR(*p++);
|
||||
}
|
||||
*w++ = ch;
|
||||
}
|
||||
*w = '\0';
|
||||
Tcl_DStringSetLength(dsPtr,
|
||||
(oldLength + ((char *) w - (char *) wString)));
|
||||
oldLength + ((char *) w - (char *) wString));
|
||||
|
||||
return wString;
|
||||
}
|
||||
@@ -492,7 +581,7 @@ Tcl_UtfCharComplete(
|
||||
* a complete UTF-8 character. */
|
||||
int length) /* Length of above string in bytes. */
|
||||
{
|
||||
return length >= totalBytes[(unsigned char)*src];
|
||||
return length >= complete[UCHAR(*src)];
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -515,41 +604,52 @@ Tcl_UtfCharComplete(
|
||||
|
||||
int
|
||||
Tcl_NumUtfChars(
|
||||
register const char *src, /* The UTF-8 string to measure. */
|
||||
int length) /* The length of the string in bytes, or -1
|
||||
* for strlen(string). */
|
||||
const char *src, /* The UTF-8 string to measure. */
|
||||
int length) /* The length of the string in bytes, or -1
|
||||
* for strlen(string). */
|
||||
{
|
||||
Tcl_UniChar ch = 0;
|
||||
register int i = 0;
|
||||
|
||||
/*
|
||||
* The separate implementations are faster.
|
||||
*
|
||||
* Since this is a time-sensitive function, we also do the check for the
|
||||
* single-byte char case specially.
|
||||
*/
|
||||
int i = 0;
|
||||
|
||||
if (length < 0) {
|
||||
while (*src != '\0') {
|
||||
/* string is NUL-terminated, so TclUtfToUniChar calls are safe. */
|
||||
while ((*src != '\0') && (i < INT_MAX)) {
|
||||
src += TclUtfToUniChar(src, &ch);
|
||||
i++;
|
||||
}
|
||||
if (i < 0) i = INT_MAX; /* Bug [2738427] */
|
||||
} else {
|
||||
register const char *endPtr = src + length - TCL_UTF_MAX;
|
||||
/* Will return value between 0 and length. No overflow checks. */
|
||||
|
||||
/* Pointer to the end of string. Never read endPtr[0] */
|
||||
const char *endPtr = src + length;
|
||||
/* Pointer to last byte where optimization still can be used */
|
||||
const char *optPtr = endPtr - TCL_UTF_MAX;
|
||||
|
||||
/*
|
||||
* Optimize away the call in this loop. Justified because...
|
||||
* when (src <= optPtr), (endPtr - src) >= (endPtr - optPtr)
|
||||
* By initialization above (endPtr - optPtr) = TCL_UTF_MAX
|
||||
* So (endPtr - src) >= TCL_UTF_MAX, and passing that to
|
||||
* Tcl_UtfCharComplete we know will cause return of 1.
|
||||
*/
|
||||
while (src <= optPtr
|
||||
/* && Tcl_UtfCharComplete(src, endPtr - src) */ ) {
|
||||
src += TclUtfToUniChar(src, &ch);
|
||||
i++;
|
||||
}
|
||||
/* Loop over the remaining string where call must happen */
|
||||
while (src < endPtr) {
|
||||
src += TclUtfToUniChar(src, &ch);
|
||||
if (Tcl_UtfCharComplete(src, endPtr - src)) {
|
||||
src += TclUtfToUniChar(src, &ch);
|
||||
} else {
|
||||
/*
|
||||
* src points to incomplete UTF-8 sequence
|
||||
* Treat first byte as character and count it
|
||||
*/
|
||||
src++;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
endPtr += TCL_UTF_MAX;
|
||||
while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) {
|
||||
src += TclUtfToUniChar(src, &ch);
|
||||
i++;
|
||||
}
|
||||
if (src < endPtr) {
|
||||
i += endPtr - src;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
@@ -559,7 +659,7 @@ Tcl_NumUtfChars(
|
||||
*
|
||||
* Tcl_UtfFindFirst --
|
||||
*
|
||||
* Returns a pointer to the first occurance of the given Unicode character
|
||||
* Returns a pointer to the first occurrence of the given Unicode character
|
||||
* in the NULL-terminated UTF-8 string. The NULL terminator is considered
|
||||
* part of the UTF-8 string. Equivalent to Plan 9 utfrune().
|
||||
*
|
||||
@@ -578,19 +678,10 @@ Tcl_UtfFindFirst(
|
||||
const char *src, /* The UTF-8 string to be searched. */
|
||||
int ch) /* The Unicode character to search for. */
|
||||
{
|
||||
int len, fullchar;
|
||||
Tcl_UniChar find = 0;
|
||||
|
||||
while (1) {
|
||||
len = TclUtfToUniChar(src, &find);
|
||||
fullchar = find;
|
||||
#if TCL_UTF_MAX <= 4
|
||||
if ((fullchar != ch) && (find >= 0xD800) && (len < 3)) {
|
||||
len += TclUtfToUniChar(src + len, &find);
|
||||
fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
|
||||
}
|
||||
#endif
|
||||
if (fullchar == ch) {
|
||||
int find, len = TclUtfToUCS4(src, &find);
|
||||
|
||||
if (find == ch) {
|
||||
return src;
|
||||
}
|
||||
if (*src == '\0') {
|
||||
@@ -605,7 +696,7 @@ Tcl_UtfFindFirst(
|
||||
*
|
||||
* Tcl_UtfFindLast --
|
||||
*
|
||||
* Returns a pointer to the last occurance of the given Unicode character
|
||||
* Returns a pointer to the last occurrence of the given Unicode character
|
||||
* in the NULL-terminated UTF-8 string. The NULL terminator is considered
|
||||
* part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
|
||||
*
|
||||
@@ -624,21 +715,12 @@ Tcl_UtfFindLast(
|
||||
const char *src, /* The UTF-8 string to be searched. */
|
||||
int ch) /* The Unicode character to search for. */
|
||||
{
|
||||
int len, fullchar;
|
||||
Tcl_UniChar find = 0;
|
||||
const char *last;
|
||||
const char *last = NULL;
|
||||
|
||||
last = NULL;
|
||||
while (1) {
|
||||
len = TclUtfToUniChar(src, &find);
|
||||
fullchar = find;
|
||||
#if TCL_UTF_MAX <= 4
|
||||
if ((fullchar != ch) && (find >= 0xD800) && (len < 3)) {
|
||||
len += TclUtfToUniChar(src + len, &find);
|
||||
fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000;
|
||||
}
|
||||
#endif
|
||||
if (fullchar == ch) {
|
||||
int find, len = TclUtfToUCS4(src, &find);
|
||||
|
||||
if (find == ch) {
|
||||
last = src;
|
||||
}
|
||||
if (*src == '\0') {
|
||||
@@ -654,9 +736,11 @@ Tcl_UtfFindLast(
|
||||
*
|
||||
* Tcl_UtfNext --
|
||||
*
|
||||
* Given a pointer to some current location in a UTF-8 string, move
|
||||
* forward one character. The caller must ensure that they are not asking
|
||||
* for the next character after the last character in the string.
|
||||
* Given a pointer to some location in a UTF-8 string, Tcl_UtfNext
|
||||
* returns a pointer to the next UTF-8 character in the string.
|
||||
* The caller must not ask for the next character after the last
|
||||
* character in the string if the string is not terminated by a null
|
||||
* character.
|
||||
*
|
||||
* Results:
|
||||
* The return value is the pointer to the next character in the UTF-8
|
||||
@@ -672,15 +756,33 @@ const char *
|
||||
Tcl_UtfNext(
|
||||
const char *src) /* The current location in the string. */
|
||||
{
|
||||
Tcl_UniChar ch = 0;
|
||||
int len = TclUtfToUniChar(src, &ch);
|
||||
int left;
|
||||
const char *next;
|
||||
|
||||
#if TCL_UTF_MAX <= 4
|
||||
if ((ch >= 0xD800) && (len < 3)) {
|
||||
len += TclUtfToUniChar(src + len, &ch);
|
||||
left = totalBytes[UCHAR(*src)];
|
||||
next = src + 1;
|
||||
while (--left) {
|
||||
if ((*next & 0xC0) != 0x80) {
|
||||
/*
|
||||
* src points to non-trail byte; We ran out of trail bytes
|
||||
* before the needs of the lead byte were satisfied.
|
||||
* Let the (malformed) lead byte alone be a character
|
||||
*/
|
||||
return src + 1;
|
||||
}
|
||||
next++;
|
||||
}
|
||||
#endif
|
||||
return src + len;
|
||||
/*
|
||||
* Call Invalid() here only if required conditions are met:
|
||||
* src[0] is known a lead byte.
|
||||
* src[1] is known a trail byte.
|
||||
* Especially important to prevent calls when src[0] == '\xF8' or '\xFC'
|
||||
* See tests utf-6.37 through utf-6.43 through valgrind or similar tool.
|
||||
*/
|
||||
if ((next == src + 1) || Invalid(src)) {
|
||||
return src + 1;
|
||||
}
|
||||
return next;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -706,31 +808,95 @@ Tcl_UtfNext(
|
||||
|
||||
const char *
|
||||
Tcl_UtfPrev(
|
||||
const char *src, /* The current location in the string. */
|
||||
const char *start) /* Pointer to the beginning of the string, to
|
||||
* avoid going backwards too far. */
|
||||
const char *src, /* A location in a UTF-8 string. */
|
||||
const char *start) /* Pointer to the beginning of the string */
|
||||
{
|
||||
const char *look;
|
||||
int i, byte;
|
||||
int trailBytesSeen = 0; /* How many trail bytes have been verified? */
|
||||
const char *fallback = src - 1;
|
||||
/* If we cannot find a lead byte that might
|
||||
* start a prefix of a valid UTF byte sequence,
|
||||
* we will fallback to a one-byte back step */
|
||||
const char *look = fallback;
|
||||
/* Start search at the fallback position */
|
||||
|
||||
/* Quick boundary case exit. */
|
||||
if (fallback <= start) {
|
||||
return start;
|
||||
}
|
||||
|
||||
do {
|
||||
unsigned char byte = UCHAR(look[0]);
|
||||
|
||||
look = --src;
|
||||
for (i = 0; i < 4; i++) {
|
||||
if (look < start) {
|
||||
if (src < start) {
|
||||
src = start;
|
||||
}
|
||||
break;
|
||||
}
|
||||
byte = *((unsigned char *) look);
|
||||
if (byte < 0x80) {
|
||||
break;
|
||||
/*
|
||||
* Single byte character. Either this is a correct previous
|
||||
* character, or it is followed by at least one trail byte
|
||||
* which indicates a malformed sequence. In either case the
|
||||
* correct result is to return the fallback.
|
||||
*/
|
||||
return fallback;
|
||||
}
|
||||
if (byte >= 0xC0) {
|
||||
return look;
|
||||
/* Non-trail byte; May be multibyte lead. */
|
||||
|
||||
if ((trailBytesSeen == 0)
|
||||
/*
|
||||
* We've seen no trailing context to use to check
|
||||
* anything. From what we know, this non-trail byte
|
||||
* is a prefix of a previous character, and accepting
|
||||
* it (the fallback) is correct.
|
||||
*/
|
||||
|
||||
|| (trailBytesSeen >= totalBytes[byte])) {
|
||||
/*
|
||||
* That is, (1 + trailBytesSeen > needed).
|
||||
* We've examined more bytes than needed to complete
|
||||
* this lead byte. No matter about well-formedness or
|
||||
* validity, the sequence starting with this lead byte
|
||||
* will never include the fallback location, so we must
|
||||
* return the fallback location. See test utf-7.17
|
||||
*/
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/*
|
||||
* trailBytesSeen > 0, so we can examine look[1] safely.
|
||||
* Use that capability to screen out invalid sequences.
|
||||
*/
|
||||
|
||||
if (Invalid(look)) {
|
||||
/* Reject */
|
||||
return fallback;
|
||||
}
|
||||
return (const char *)look;
|
||||
}
|
||||
|
||||
/* We saw a trail byte. */
|
||||
trailBytesSeen++;
|
||||
|
||||
if ((const char *)look == start) {
|
||||
/*
|
||||
* Do not read before the start of the string
|
||||
*
|
||||
* If we get here, we've examined bytes at every location
|
||||
* >= start and < src and all of them are trail bytes,
|
||||
* including (*start). We need to return our fallback
|
||||
* and exit this loop before we run past the start of the string.
|
||||
*/
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/* Continue the search backwards... */
|
||||
look--;
|
||||
}
|
||||
return src;
|
||||
} while (trailBytesSeen < 3);
|
||||
|
||||
/*
|
||||
* We've seen 3 trail bytes, so we know there will not be a
|
||||
* properly formed byte sequence to find, and we can stop looking,
|
||||
* accepting the fallback.
|
||||
*/
|
||||
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -752,8 +918,8 @@ Tcl_UtfPrev(
|
||||
|
||||
Tcl_UniChar
|
||||
Tcl_UniCharAtIndex(
|
||||
register const char *src, /* The UTF-8 string to dereference. */
|
||||
register int index) /* The position of the desired character. */
|
||||
const char *src, /* The UTF-8 string to dereference. */
|
||||
int index) /* The position of the desired character. */
|
||||
{
|
||||
Tcl_UniChar ch = 0;
|
||||
|
||||
@@ -782,8 +948,8 @@ Tcl_UniCharAtIndex(
|
||||
|
||||
const char *
|
||||
Tcl_UtfAtIndex(
|
||||
register const char *src, /* The UTF-8 string. */
|
||||
register int index) /* The position of the desired character. */
|
||||
const char *src, /* The UTF-8 string. */
|
||||
int index) /* The position of the desired character. */
|
||||
{
|
||||
Tcl_UniChar ch = 0;
|
||||
int len = 0;
|
||||
@@ -846,7 +1012,7 @@ Tcl_UtfBackslash(
|
||||
* We ate a whole line. Pay the price of a strlen()
|
||||
*/
|
||||
|
||||
result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
|
||||
result = TclParseBackslash(src, strlen(src), &numRead, dst);
|
||||
}
|
||||
if (readPtr != NULL) {
|
||||
*readPtr = numRead;
|
||||
@@ -876,7 +1042,7 @@ int
|
||||
Tcl_UtfToUpper(
|
||||
char *str) /* String to convert in place. */
|
||||
{
|
||||
Tcl_UniChar ch = 0, upChar;
|
||||
int ch, upChar;
|
||||
char *src, *dst;
|
||||
int len;
|
||||
|
||||
@@ -886,8 +1052,8 @@ Tcl_UtfToUpper(
|
||||
|
||||
src = dst = str;
|
||||
while (*src) {
|
||||
len = TclUtfToUniChar(src, &ch);
|
||||
upChar = Tcl_UniCharToUpper(ch);
|
||||
len = TclUtfToUCS4(src, &ch);
|
||||
upChar = UCS4ToUpper(ch);
|
||||
|
||||
/*
|
||||
* To keep badly formed Utf strings from getting inflated by the
|
||||
@@ -899,7 +1065,7 @@ Tcl_UtfToUpper(
|
||||
memmove(dst, src, len);
|
||||
dst += len;
|
||||
} else {
|
||||
dst += Tcl_UniCharToUtf(upChar, dst);
|
||||
dst += TclUCS4ToUtf(upChar, dst);
|
||||
}
|
||||
src += len;
|
||||
}
|
||||
@@ -929,7 +1095,7 @@ int
|
||||
Tcl_UtfToLower(
|
||||
char *str) /* String to convert in place. */
|
||||
{
|
||||
Tcl_UniChar ch = 0, lowChar;
|
||||
int ch, lowChar;
|
||||
char *src, *dst;
|
||||
int len;
|
||||
|
||||
@@ -939,8 +1105,8 @@ Tcl_UtfToLower(
|
||||
|
||||
src = dst = str;
|
||||
while (*src) {
|
||||
len = TclUtfToUniChar(src, &ch);
|
||||
lowChar = Tcl_UniCharToLower(ch);
|
||||
len = TclUtfToUCS4(src, &ch);
|
||||
lowChar = TclUCS4ToLower(ch);
|
||||
|
||||
/*
|
||||
* To keep badly formed Utf strings from getting inflated by the
|
||||
@@ -952,7 +1118,7 @@ Tcl_UtfToLower(
|
||||
memmove(dst, src, len);
|
||||
dst += len;
|
||||
} else {
|
||||
dst += Tcl_UniCharToUtf(lowChar, dst);
|
||||
dst += TclUCS4ToUtf(lowChar, dst);
|
||||
}
|
||||
src += len;
|
||||
}
|
||||
@@ -983,7 +1149,7 @@ int
|
||||
Tcl_UtfToTitle(
|
||||
char *str) /* String to convert in place. */
|
||||
{
|
||||
Tcl_UniChar ch = 0, titleChar, lowChar;
|
||||
int ch, titleChar, lowChar;
|
||||
char *src, *dst;
|
||||
int len;
|
||||
|
||||
@@ -995,30 +1161,30 @@ Tcl_UtfToTitle(
|
||||
src = dst = str;
|
||||
|
||||
if (*src) {
|
||||
len = TclUtfToUniChar(src, &ch);
|
||||
titleChar = Tcl_UniCharToTitle(ch);
|
||||
len = TclUtfToUCS4(src, &ch);
|
||||
titleChar = UCS4ToTitle(ch);
|
||||
|
||||
if (len < UtfCount(titleChar)) {
|
||||
memmove(dst, src, len);
|
||||
dst += len;
|
||||
} else {
|
||||
dst += Tcl_UniCharToUtf(titleChar, dst);
|
||||
dst += TclUCS4ToUtf(titleChar, dst);
|
||||
}
|
||||
src += len;
|
||||
}
|
||||
while (*src) {
|
||||
len = TclUtfToUniChar(src, &ch);
|
||||
len = TclUtfToUCS4(src, &ch);
|
||||
lowChar = ch;
|
||||
/* Special exception for Georgian Asomtavruli chars, no titlecase. */
|
||||
if ((unsigned)(lowChar - 0x1C90) >= 0x30) {
|
||||
lowChar = Tcl_UniCharToLower(lowChar);
|
||||
lowChar = TclUCS4ToLower(lowChar);
|
||||
}
|
||||
|
||||
if (len < UtfCount(lowChar)) {
|
||||
memmove(dst, src, len);
|
||||
dst += len;
|
||||
} else {
|
||||
dst += Tcl_UniCharToUtf(lowChar, dst);
|
||||
dst += TclUCS4ToUtf(lowChar, dst);
|
||||
}
|
||||
src += len;
|
||||
}
|
||||
@@ -1055,7 +1221,7 @@ TclpUtfNcmp2(
|
||||
* fine in the strcmp manner.
|
||||
*/
|
||||
|
||||
register int result = 0;
|
||||
int result = 0;
|
||||
|
||||
for ( ; numBytes != 0; numBytes--, cs++, ct++) {
|
||||
if (*cs != *ct) {
|
||||
@@ -1116,11 +1282,11 @@ Tcl_UtfNcmp(
|
||||
if (ch1 != ch2) {
|
||||
#if TCL_UTF_MAX == 4
|
||||
/* Surrogates always report higher than non-surrogates */
|
||||
if (((ch1 & 0xFC00) == 0xD800)) {
|
||||
if ((ch2 & 0xFC00) != 0xD800) {
|
||||
if (((ch1 & ~0x3FF) == 0xD800)) {
|
||||
if ((ch2 & ~0x3FF) != 0xD800) {
|
||||
return ch1;
|
||||
}
|
||||
} else if ((ch2 & 0xFC00) == 0xD800) {
|
||||
} else if ((ch2 & ~0x3FF) == 0xD800) {
|
||||
return -ch2;
|
||||
}
|
||||
#endif
|
||||
@@ -1251,16 +1417,26 @@ TclUtfCasecmp(
|
||||
*----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
 * UCS4ToUpper --
 *
 *	Apply the upper-case mapping from the Unicode character tables to a
 *	code point held in an int. Values for which UNICODE_OUT_OF_RANGE()
 *	is true skip the table lookup. In every case only the low 21 bits
 *	of the result are returned.
 */

static int
UCS4ToUpper(
    int ch)			/* Unicode character to convert. */
{
    int converted = ch;

    if (!UNICODE_OUT_OF_RANGE(ch)) {
	const int info = GetUniCharInfo(ch);

	/* The delta is subtracted only when bit 0x04 of the case type is set. */
	if ((GetCaseType(info) & 0x04) != 0) {
	    converted = ch - GetDelta(info);
	}
    }

    /* Clear away extension bits, if any */
    return converted & 0x1FFFFF;
}
|
||||
|
||||
Tcl_UniChar
|
||||
Tcl_UniCharToUpper(
|
||||
int ch) /* Unicode character to convert. */
|
||||
{
|
||||
int info = GetUniCharInfo(ch);
|
||||
|
||||
if (GetCaseType(info) & 0x04) {
|
||||
ch -= GetDelta(info);
|
||||
}
|
||||
return (Tcl_UniChar) ch;
|
||||
return (Tcl_UniChar) UCS4ToUpper(ch);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1279,17 +1455,27 @@ Tcl_UniCharToUpper(
|
||||
*----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
 * TclUCS4ToLower --
 *
 *	Apply the lower-case mapping from the Unicode character tables to a
 *	code point held in an int. Values for which UNICODE_OUT_OF_RANGE()
 *	is true skip the table lookup. In every case only the low 21 bits
 *	of the result are returned.
 */

int
TclUCS4ToLower(
    int ch)			/* Unicode character to convert. */
{
    int converted = ch;

    if (!UNICODE_OUT_OF_RANGE(ch)) {
	const int info = GetUniCharInfo(ch);
	const int mode = GetCaseType(info);

	/*
	 * The delta is added when bit 0x02 of the case type is set, except
	 * for the special case type 0x7, which is left untouched.
	 */
	if ((mode != 0x7) && ((mode & 0x02) != 0)) {
	    converted = ch + GetDelta(info);
	}
    }

    /* Clear away extension bits, if any */
    return converted & 0x1FFFFF;
}
|
||||
|
||||
Tcl_UniChar
|
||||
Tcl_UniCharToLower(
|
||||
int ch) /* Unicode character to convert. */
|
||||
{
|
||||
int info = GetUniCharInfo(ch);
|
||||
int mode = GetCaseType(info);
|
||||
|
||||
if ((mode & 0x02) && (mode != 0x7)) {
|
||||
ch += GetDelta(info);
|
||||
}
|
||||
return (Tcl_UniChar) ch;
|
||||
return (Tcl_UniChar) TclUCS4ToLower(ch);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1308,25 +1494,35 @@ Tcl_UniCharToLower(
|
||||
*----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*
 * UCS4ToTitle --
 *
 *	Apply the title-case mapping from the Unicode character tables to a
 *	code point held in an int. Values for which UNICODE_OUT_OF_RANGE()
 *	is true skip the table lookup. In every case only the low 21 bits
 *	of the result are returned.
 */

static int
UCS4ToTitle(
    int ch)			/* Unicode character to convert. */
{
    int converted = ch;

    if (!UNICODE_OUT_OF_RANGE(ch)) {
	const int info = GetUniCharInfo(ch);
	const int mode = GetCaseType(info);

	if ((mode & 0x1) == 0) {
	    /* Without bit 0x1 only case type 0x4 is converted, via the delta. */
	    if (mode == 0x4) {
		converted = ch - GetDelta(info);
	    }
	} else if (mode != 0x7) {
	    /*
	     * Subtract or add one depending on the original case; case
	     * type 0x7 is left unchanged.
	     */
	    if ((mode & 0x4) != 0) {
		converted = ch - 1;
	    } else {
		converted = ch + 1;
	    }
	}
    }

    /* Clear away extension bits, if any */
    return converted & 0x1FFFFF;
}
|
||||
|
||||
Tcl_UniChar
|
||||
Tcl_UniCharToTitle(
|
||||
int ch) /* Unicode character to convert. */
|
||||
{
|
||||
int info = GetUniCharInfo(ch);
|
||||
int mode = GetCaseType(info);
|
||||
|
||||
if (mode & 0x1) {
|
||||
/*
|
||||
* Subtract or add one depending on the original case.
|
||||
*/
|
||||
|
||||
if (mode != 0x7) {
|
||||
ch += ((mode & 0x4) ? -1 : 1);
|
||||
}
|
||||
} else if (mode == 0x4) {
|
||||
ch -= GetDelta(info);
|
||||
}
|
||||
return (Tcl_UniChar) ch;
|
||||
return (Tcl_UniChar) UCS4ToTitle(ch);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1518,8 +1714,9 @@ Tcl_UniCharIsControl(
|
||||
{
|
||||
#if TCL_UTF_MAX > 3
|
||||
if (UNICODE_OUT_OF_RANGE(ch)) {
|
||||
/* Clear away extension bits, if any */
|
||||
ch &= 0x1FFFFF;
|
||||
if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) {
|
||||
if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007F))) {
|
||||
return 1;
|
||||
}
|
||||
if ((ch >= 0xF0000) && ((ch & 0xFFFF) <= 0xFFFD)) {
|
||||
@@ -1581,8 +1778,7 @@ Tcl_UniCharIsGraph(
|
||||
{
|
||||
#if TCL_UTF_MAX > 3
|
||||
if (UNICODE_OUT_OF_RANGE(ch)) {
|
||||
ch &= 0x1FFFFF;
|
||||
return (ch >= 0xE0100) && (ch <= 0xE01EF);
|
||||
return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
|
||||
}
|
||||
#endif
|
||||
return ((GRAPH_BITS >> GetCategory(ch)) & 1);
|
||||
@@ -1638,8 +1834,7 @@ Tcl_UniCharIsPrint(
|
||||
{
|
||||
#if TCL_UTF_MAX > 3
|
||||
if (UNICODE_OUT_OF_RANGE(ch)) {
|
||||
ch &= 0x1FFFFF;
|
||||
return (ch >= 0xE0100) && (ch <= 0xE01EF);
|
||||
return ((unsigned)((ch & 0x1FFFFF) - 0xE0100) <= 0xEF);
|
||||
}
|
||||
#endif
|
||||
return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
|
||||
@@ -1707,7 +1902,7 @@ Tcl_UniCharIsSpace(
|
||||
*/
|
||||
|
||||
if (ch < 0x80) {
|
||||
return TclIsSpaceProc((char) ch);
|
||||
return TclIsSpaceProcM((char) ch);
|
||||
#if TCL_UTF_MAX > 3
|
||||
} else if (UNICODE_OUT_OF_RANGE(ch)) {
|
||||
return 0;
|
||||
@@ -2157,6 +2352,119 @@ TclUniCharMatch(
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
*---------------------------------------------------------------------------
|
||||
*
|
||||
* TclUtfToUCS4 --
|
||||
*
|
||||
* Extract the 4-byte codepoint from the leading bytes of the
|
||||
* Modified UTF-8 string "src". This is a utility routine to
|
||||
* contain the surrogate gymnastics in one place.
|
||||
*
|
||||
* The caller must ensure that the source buffer is long enough that this
|
||||
* routine does not run off the end and dereference non-existent memory
|
||||
* looking for trail bytes. If the source buffer is known to be '\0'
|
||||
* terminated, this cannot happen. Otherwise, the caller should call
|
||||
* TclUCS4Complete() before calling this routine to ensure that
|
||||
* enough bytes remain in the string.
|
||||
*
|
||||
* Results:
|
||||
* *usc4Ptr is filled with the UCS4 code point, and the return value is
|
||||
* the number of bytes from the UTF-8 string that were consumed.
|
||||
*
|
||||
* Side effects:
|
||||
* None.
|
||||
*
|
||||
*---------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
int
|
||||
TclUtfToUCS4(
|
||||
const char *src, /* The UTF-8 string. */
|
||||
int *ucs4Ptr) /* Filled with the UCS4 codepoint represented
|
||||
* by the UTF-8 string. */
|
||||
{
|
||||
Tcl_UniChar ch = 0;
|
||||
int len = Tcl_UtfToUniChar(src, &ch);
|
||||
|
||||
#if TCL_UTF_MAX <= 4
|
||||
if ((ch & ~0x3FF) == 0xD800) {
|
||||
Tcl_UniChar low = ch;
|
||||
int len2 = Tcl_UtfToUniChar(src+len, &low);
|
||||
if ((low & ~0x3FF) == 0xDC00) {
|
||||
*ucs4Ptr = (((ch & 0x3FF) << 10) | (low & 0x3FF)) + 0x10000;
|
||||
return len + len2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
*ucs4Ptr = (int)ch;
|
||||
return len;
|
||||
}
|
||||
|
||||
#if TCL_UTF_MAX == 4
/*
 * TclUniCharToUCS4 --
 *
 *	Read one code point from the front of a Tcl_UniChar string. A high
 *	surrogate immediately followed by a low surrogate is merged into a
 *	single code point above 0xFFFF and counts as two units; anything
 *	else is returned as is and counts as one unit. src[1] is examined
 *	only when src[0] is a high surrogate.
 *
 * Results:
 *	*ucs4Ptr is filled with the code point; the return value is the
 *	number of Tcl_UniChar units consumed (1 or 2).
 */

int
TclUniCharToUCS4(
    const Tcl_UniChar *src,	/* The Tcl_UniChar string. */
    int *ucs4Ptr)		/* Filled with the UCS4 codepoint represented
				 * by the Tcl_UniChar string. */
{
    if (((src[0] & 0xFC00) != 0xD800) || ((src[1] & 0xFC00) != 0xDC00)) {
	/* Not a surrogate pair: hand back the single unit unchanged. */
	*ucs4Ptr = src[0];
	return 1;
    }

    *ucs4Ptr = (((src[0] & 0x3FF) << 10) | (src[1] & 0x3FF)) + 0x10000;
    return 2;
}
#endif
|
||||
|
||||
/*
 *---------------------------------------------------------------------------
 *
 * TclUCS4ToUtf --
 *
 *	Store the given Unicode character as a sequence of UTF-8 bytes in the
 *	provided buffer. Might output 6 bytes, if the code point > 0xFFFF.
 *
 * Results:
 *	The return value is the number of bytes in the buffer that were
 *	consumed. If ch == -1, this function outputs 0 bytes (empty string),
 *	since TclGetUCS4 returns -1 for out-of-range indices.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

int
TclUCS4ToUtf(
    int ch,			/* Unicode character to be stored in the
				 * buffer. */
    char *buf)			/* Buffer in which the UTF-8 representation of
				 * the Unicode character is stored. Buffer must
				 * be large enough to hold the UTF-8
				 * character(s) (at most 6 bytes). */
{
    /* -1 stands for "no character": write nothing. */
    if (ch == -1) {
	return 0;
    }

#if TCL_UTF_MAX <= 4
    if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
	/*
	 * Code point above 0xFFFF: hand both surrogate halves to
	 * Tcl_UniCharToUtf, which spits out a 4-byte UTF-8 character or
	 * 2 x 3-byte UTF-8 characters, depending on Tcl version and/or
	 * TCL_UTF_MAX build value. Bit 10 of the 0x7FF mask is already set
	 * in 0xDC00, so it does not alter the low surrogate.
	 */
	const int highSurrogate = 0xD800 | ((ch - 0x10000) >> 10);
	const int lowSurrogate = 0xDC00 | (ch & 0x7FF);
	int written = Tcl_UniCharToUtf(highSurrogate, buf);

	written += Tcl_UniCharToUtf(lowSurrogate, buf + written);
	return written;
    }
#endif

    if ((ch & ~0x7FF) == 0xD800) {
	/* A lone surrogate is written directly as a 3-byte sequence. */
	buf[0] = (char) ((ch >> 12) | 0xE0);
	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[2] = (char) ((ch | 0x80) & 0xBF);
	return 3;
    }

    return Tcl_UniCharToUtf(ch, buf);
}
|
||||
|
||||
/*
|
||||
* Local Variables:
|
||||
* mode: c
|
||||
|
||||
Reference in New Issue
Block a user