diff --git a/src/raylib.h b/src/raylib.h
index 9bc7c2fa8..6c1e36f09 100644
--- a/src/raylib.h
+++ b/src/raylib.h
@@ -1188,14 +1188,11 @@ RLAPI void DrawTextRecEx(Font font, const char *text, Rectangle rec, float fontS
 RLAPI int MeasureText(const char *text, int fontSize); // Measure string width for default font
 RLAPI Vector2 MeasureTextEx(Font font, const char *text, float fontSize, float spacing); // Measure string size for Font
 RLAPI int GetGlyphIndex(Font font, int character); // Get index position for a unicode character on font
-RLAPI int GetNextCodepoint(const char *text, int *bytesProcessed); // Returns next codepoint in a UTF8 encoded string; 0x3f('?') is returned on failure
-RLAPI int *GetCodepoints(const char *text, int *count); // Get all codepoints in a string, codepoints count returned by parameters
 
-// Text strings management functions
+// Text strings management functions (no UTF-8 strings, only byte chars)
 // NOTE: Some strings allocate memory internally for returned strings, just be careful!
 RLAPI bool TextIsEqual(const char *text1, const char *text2); // Check if two text string are equal
 RLAPI unsigned int TextLength(const char *text); // Get text length, checks for '\0' ending
-RLAPI unsigned int TextCountCodepoints(const char *text); // Get total number of characters (codepoints) in a UTF8 encoded string
 RLAPI const char *TextFormat(const char *text, ...); // Text formatting with variables (sprintf style)
 RLAPI const char *TextSubtext(const char *text, int position, int length); // Get a piece of a text string
 RLAPI char *TextReplace(char *text, const char *replace, const char *by); // Replace text string (memory should be freed!)
@@ -1208,6 +1205,12 @@ RLAPI const char *TextToUpper(const char *text); // Get upp
 RLAPI const char *TextToLower(const char *text); // Get lower case version of provided string
 RLAPI const char *TextToPascal(const char *text); // Get Pascal case notation version of provided string
 RLAPI int TextToInteger(const char *text); // Get integer value from text (negative values not supported)
+RLAPI const char *TextToUtf8(int codepoint, int *byteLength); // Encode codepoint into utf8 text (char array length returned as parameter)
+
+// UTF8 text strings management functions
+RLAPI int *GetCodepoints(const char *text, int *count); // Get all codepoints in a string, codepoints count returned by parameters
+RLAPI int GetCodepointsCount(const char *text); // Get total number of characters (codepoints) in a UTF8 encoded string
+RLAPI int GetNextCodepoint(const char *text, int *bytesProcessed); // Returns next codepoint in a UTF8 encoded string; 0x3f('?') is returned on failure
 
 //------------------------------------------------------------------------------------
 // Basic 3d Shapes Drawing Functions (Module: models)
diff --git a/src/text.c b/src/text.c
index 09295ab7b..66d3a7257 100644
--- a/src/text.c
+++ b/src/text.c
@@ -777,140 +777,6 @@ void DrawFPS(int posX, int posY)
     DrawText(TextFormat("%2i FPS", fps), posX, posY, 20, LIME);
 }
 
-// Returns next codepoint in a UTF8 encoded text, scanning until '\0' is found
-// When a invalid UTF8 byte is encountered we exit as soon as possible and a '?'(0x3f) codepoint is returned
-// Total number of bytes processed are returned as a parameter
-// NOTE: the standard says U+FFFD should be returned in case of errors
-// but that character is not supported by the default font in raylib
-// TODO: optimize this code for speed!!
-int GetNextCodepoint(const char *text, int *bytesProcessed)
-{
-/*
-    UTF8 specs from https://www.ietf.org/rfc/rfc3629.txt
-
-    Char. number range  |        UTF-8 octet sequence
-      (hexadecimal)     |              (binary)
-    --------------------+---------------------------------------------
-    0000 0000-0000 007F | 0xxxxxxx
-    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
-    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
-    0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-*/
-
-    // NOTE: on decode errors we return as soon as possible
-
-    int code = 0x3f; // Codepoint (defaults to '?')
-    int octet = (unsigned char)(text[0]); // The first UTF8 octet
-    *bytesProcessed = 1;
-
-    if (octet <= 0x7f)
-    {
-        // Only one octet (ASCII range x00-7F)
-        code = text[0];
-    }
-    else if ((octet & 0xe0) == 0xc0)
-    {
-        // Two octets
-        // [0]xC2-DF [1]UTF8-tail(x80-BF)
-        unsigned char octet1 = text[1];
-
-        if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { *bytesProcessed = 2; return code; } // Unexpected sequence
-
-        if ((octet >= 0xc2) && (octet <= 0xdf))
-        {
-            code = ((octet & 0x1f) << 6) | (octet1 & 0x3f);
-            *bytesProcessed = 2;
-        }
-    }
-    else if ((octet & 0xf0) == 0xe0)
-    {
-        // Three octets
-        unsigned char octet1 = text[1];
-        unsigned char octet2 = '\0';
-
-        if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { *bytesProcessed = 2; return code; } // Unexpected sequence
-
-        octet2 = text[2];
-
-        if ((octet2 == '\0') || ((octet2 >> 6) != 2)) { *bytesProcessed = 3; return code; } // Unexpected sequence
-
-        /*
-            [0]xE0 [1]xA0-BF [2]UTF8-tail(x80-BF)
-            [0]xE1-EC [1]UTF8-tail [2]UTF8-tail(x80-BF)
-            [0]xED [1]x80-9F [2]UTF8-tail(x80-BF)
-            [0]xEE-EF [1]UTF8-tail [2]UTF8-tail(x80-BF)
-        */
-
-        if (((octet == 0xe0) && !((octet1 >= 0xa0) && (octet1 <= 0xbf))) ||
-            ((octet == 0xed) && !((octet1 >= 0x80) && (octet1 <= 0x9f)))) { *bytesProcessed = 2; return code; }
-
-        if ((octet >= 0xe0) && (0 <= 0xef))
-        {
-            code = ((octet & 0xf) << 12) | ((octet1 & 0x3f) << 6) | (octet2 & 0x3f);
-            *bytesProcessed = 3;
-        }
-    }
-    else if ((octet & 0xf8) == 0xf0)
-    {
-        // Four octets
-        if (octet > 0xf4) return code;
-
-        unsigned char octet1 = text[1];
-        unsigned char octet2 = '\0';
-        unsigned char octet3 = '\0';
-
-        if ((octet1 == '\0') || ((octet1 >> 6) != 2)) { *bytesProcessed = 2; return code; } // Unexpected sequence
-
-        octet2 = text[2];
-
-        if ((octet2 == '\0') || ((octet2 >> 6) != 2)) { *bytesProcessed = 3; return code; } // Unexpected sequence
-
-        octet3 = text[3];
-
-        if ((octet3 == '\0') || ((octet3 >> 6) != 2)) { *bytesProcessed = 4; return code; } // Unexpected sequence
-
-        /*
-            [0]xF0 [1]x90-BF [2]UTF8-tail [3]UTF8-tail
-            [0]xF1-F3 [1]UTF8-tail [2]UTF8-tail [3]UTF8-tail
-            [0]xF4 [1]x80-8F [2]UTF8-tail [3]UTF8-tail
-        */
-
-        if (((octet == 0xf0) && !((octet1 >= 0x90) && (octet1 <= 0xbf))) ||
-            ((octet == 0xf4) && !((octet1 >= 0x80) && (octet1 <= 0x8f)))) { *bytesProcessed = 2; return code; } // Unexpected sequence
-
-        if (octet >= 0xf0)
-        {
-            code = ((octet & 0x7) << 18) | ((octet1 & 0x3f) << 12) | ((octet2 & 0x3f) << 6) | (octet3 & 0x3f);
-            *bytesProcessed = 4;
-        }
-    }
-
-    if (code > 0x10ffff) code = 0x3f; // Codepoints after U+10ffff are invalid
-
-    return code;
-}
-
-// Get all codepoints in a string, codepoints count returned by parameters
-int *GetCodepoints(const char *text, int *count)
-{
-    static int codepoints[MAX_TEXT_UNICODE_CHARS] = { 0 };
-    memset(codepoints, 0, MAX_TEXT_UNICODE_CHARS*sizeof(int));
-
-    int bytesProcessed = 0;
-    int textLength = strlen(text);
-    int codepointsCount = 0;
-
-    for (int i = 0; i < textLength; codepointsCount++)
-    {
-        codepoints[codepointsCount] = GetNextCodepoint(text + i, &bytesProcessed);
-        i += bytesProcessed;
-    }
-
-    *count = codepointsCount;
-
-    return codepoints;
-}
-
 // Draw text (using default font)
 // NOTE: fontSize work like in any drawing program but if fontSize is lower than font-base-size, then font-base-size is used
 // NOTE: chars spacing is proportional to fontSize
@@ -1235,27 +1101,6 @@ unsigned int TextLength(const char *text)
     return length;
 }
 
-// Returns total number of characters(codepoints) in a UTF8 encoded text, until '\0' is found
-// NOTE: If an invalid UTF8 sequence is encountered a '?'(0x3f) codepoint is counted instead
-unsigned int TextCountCodepoints(const char *text)
-{
-    unsigned int len = 0;
-    char *ptr = (char *)&text[0];
-
-    while (*ptr != '\0')
-    {
-        int next = 0;
-        int letter = GetNextCodepoint(ptr, &next);
-
-        if (letter == 0x3f) ptr += 1;
-        else ptr += next;
-
-        len++;
-    }
-
-    return len;
-}
-
 // Formatting of text with variables to 'embed'
 const char *TextFormat(const char *text, ...)
 {
@@ -1531,6 +1376,224 @@ int TextToInteger(const char *text)
 
     return result;
 }
+
+// Encode codepoint into utf8 text (char array length returned as parameter)
+// NOTE: Returned text lives in an internal static buffer, it's always '\0' terminated
+// and it's overwritten on every call (copy it if required for later use)
+// NOTE: Values that are not valid codepoints (negative or above U+10FFFF) are not encoded,
+// an empty string is returned and byteLength is set to 0
+const char *TextToUtf8(int codepoint, int *byteLength)
+{
+    static char utf8[6] = { 0 };
+    int length = 0;
+
+    // Clear bytes left by a previous (longer) encoding to keep returned text '\0' terminated
+    memset(utf8, 0, sizeof(utf8));
+
+    if (codepoint < 0)
+    {
+        // Negative values are not valid codepoints, nothing to encode
+        length = 0;
+    }
+    else if (codepoint <= 0x7f)
+    {
+        utf8[0] = (char)codepoint;
+        length = 1;
+    }
+    else if (codepoint <= 0x7ff)
+    {
+        utf8[0] = (char)(((codepoint >> 6) & 0x1f) | 0xc0);
+        utf8[1] = (char)((codepoint & 0x3f) | 0x80);
+        length = 2;
+    }
+    else if (codepoint <= 0xffff)
+    {
+        utf8[0] = (char)(((codepoint >> 12) & 0x0f) | 0xe0);
+        utf8[1] = (char)(((codepoint >> 6) & 0x3f) | 0x80);
+        utf8[2] = (char)((codepoint & 0x3f) | 0x80);
+        length = 3;
+    }
+    else if (codepoint <= 0x10ffff)
+    {
+        utf8[0] = (char)(((codepoint >> 18) & 0x07) | 0xf0);
+        utf8[1] = (char)(((codepoint >> 12) & 0x3f) | 0x80);
+        utf8[2] = (char)(((codepoint >> 6) & 0x3f) | 0x80);
+        utf8[3] = (char)((codepoint & 0x3f) | 0x80);
+        length = 4;
+    }
+
+    *byteLength = length;
+
+    return utf8;
+}
+
+
+// Get all codepoints in a string, codepoints count returned by parameters
+// NOTE: Returned array is an internal static buffer, overwritten on every call
+// NOTE: No more than MAX_TEXT_UNICODE_CHARS codepoints are decoded, remaining text is ignored
+int *GetCodepoints(const char *text, int *count)
+{
+    static int codepoints[MAX_TEXT_UNICODE_CHARS] = { 0 };
+    memset(codepoints, 0, MAX_TEXT_UNICODE_CHARS*sizeof(int));
+
+    int bytesProcessed = 0;
+    int textLength = (int)strlen(text);
+    int codepointsCount = 0;
+
+    // Stop decoding when the static buffer is full to avoid writing out of bounds
+    for (int i = 0; (i < textLength) && (codepointsCount < MAX_TEXT_UNICODE_CHARS); codepointsCount++)
+    {
+        codepoints[codepointsCount] = GetNextCodepoint(text + i, &bytesProcessed);
+        i += bytesProcessed;
+    }
+
+    *count = codepointsCount;
+
+    return codepoints;
+}
+
+// Returns total number of characters(codepoints) in a UTF8 encoded text, until '\0' is found
+// NOTE: If an invalid UTF8 sequence is encountered a '?'(0x3f) codepoint is counted instead
+int GetCodepointsCount(const char *text)
+{
+    int len = 0;
+    const char *ptr = text;
+
+    while (*ptr != '\0')
+    {
+        int next = 0;
+        int letter = GetNextCodepoint(ptr, &next);
+
+        // On decode failure (or a literal '?') advance one byte only, so decoding restarts at the next byte
+        if (letter == 0x3f) ptr += 1;
+        else ptr += next;
+
+        len++;
+    }
+
+    return len;
+}
+
+
+// Returns next codepoint in a UTF8 encoded text, scanning until '\0' is found
+// When an invalid UTF8 byte is encountered we exit as soon as possible and a '?'(0x3f) codepoint is returned
+// Total number of bytes processed are returned as a parameter
+// NOTE: On truncated sequences the '\0' terminator is never counted as processed;
+// for an empty text, codepoint 0 is returned with bytesProcessed = 1
+// NOTE: the standard says U+FFFD should be returned in case of errors
+// but that character is not supported by the default font in raylib
+// TODO: optimize this code for speed!!
+int GetNextCodepoint(const char *text, int *bytesProcessed)
+{
+/*
+    UTF8 specs from https://www.ietf.org/rfc/rfc3629.txt
+
+    Char. number range  |        UTF-8 octet sequence
+      (hexadecimal)     |              (binary)
+    --------------------+---------------------------------------------
+    0000 0000-0000 007F | 0xxxxxxx
+    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+    0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+*/
+    // NOTE: on decode errors we return as soon as possible
+
+    int code = 0x3f; // Codepoint (defaults to '?')
+    int octet = (unsigned char)(text[0]); // The first UTF8 octet
+    *bytesProcessed = 1;
+
+    if (octet <= 0x7f)
+    {
+        // Only one octet (ASCII range x00-7F)
+        code = text[0];
+    }
+    else if ((octet & 0xe0) == 0xc0)
+    {
+        // Two octets
+        // [0]xC2-DF [1]UTF8-tail(x80-BF)
+        unsigned char octet1 = text[1];
+
+        if (octet1 == '\0') return code; // Truncated sequence, terminator must not be consumed
+        if ((octet1 >> 6) != 2) { *bytesProcessed = 2; return code; } // Unexpected sequence
+
+        if ((octet >= 0xc2) && (octet <= 0xdf))
+        {
+            code = ((octet & 0x1f) << 6) | (octet1 & 0x3f);
+            *bytesProcessed = 2;
+        }
+    }
+    else if ((octet & 0xf0) == 0xe0)
+    {
+        // Three octets
+        unsigned char octet1 = text[1];
+        unsigned char octet2 = '\0';
+
+        if (octet1 == '\0') return code; // Truncated sequence, terminator must not be consumed
+        if ((octet1 >> 6) != 2) { *bytesProcessed = 2; return code; } // Unexpected sequence
+
+        octet2 = text[2];
+
+        if (octet2 == '\0') { *bytesProcessed = 2; return code; } // Truncated sequence, terminator must not be consumed
+        if ((octet2 >> 6) != 2) { *bytesProcessed = 3; return code; } // Unexpected sequence
+
+        /*
+            [0]xE0 [1]xA0-BF [2]UTF8-tail(x80-BF)
+            [0]xE1-EC [1]UTF8-tail [2]UTF8-tail(x80-BF)
+            [0]xED [1]x80-9F [2]UTF8-tail(x80-BF)
+            [0]xEE-EF [1]UTF8-tail [2]UTF8-tail(x80-BF)
+        */
+
+        if (((octet == 0xe0) && !((octet1 >= 0xa0) && (octet1 <= 0xbf))) ||
+            ((octet == 0xed) && !((octet1 >= 0x80) && (octet1 <= 0x9f)))) { *bytesProcessed = 2; return code; }
+
+        if ((octet >= 0xe0) && (octet <= 0xef))
+        {
+            code = ((octet & 0xf) << 12) | ((octet1 & 0x3f) << 6) | (octet2 & 0x3f);
+            *bytesProcessed = 3;
+        }
+    }
+    else if ((octet & 0xf8) == 0xf0)
+    {
+        // Four octets
+        if (octet > 0xf4) return code;
+
+        unsigned char octet1 = text[1];
+        unsigned char octet2 = '\0';
+        unsigned char octet3 = '\0';
+
+        if (octet1 == '\0') return code; // Truncated sequence, terminator must not be consumed
+        if ((octet1 >> 6) != 2) { *bytesProcessed = 2; return code; } // Unexpected sequence
+
+        octet2 = text[2];
+
+        if (octet2 == '\0') { *bytesProcessed = 2; return code; } // Truncated sequence, terminator must not be consumed
+        if ((octet2 >> 6) != 2) { *bytesProcessed = 3; return code; } // Unexpected sequence
+
+        octet3 = text[3];
+
+        if (octet3 == '\0') { *bytesProcessed = 3; return code; } // Truncated sequence, terminator must not be consumed
+        if ((octet3 >> 6) != 2) { *bytesProcessed = 4; return code; } // Unexpected sequence
+
+        /*
+            [0]xF0 [1]x90-BF [2]UTF8-tail [3]UTF8-tail
+            [0]xF1-F3 [1]UTF8-tail [2]UTF8-tail [3]UTF8-tail
+            [0]xF4 [1]x80-8F [2]UTF8-tail [3]UTF8-tail
+        */
+
+        if (((octet == 0xf0) && !((octet1 >= 0x90) && (octet1 <= 0xbf))) ||
+            ((octet == 0xf4) && !((octet1 >= 0x80) && (octet1 <= 0x8f)))) { *bytesProcessed = 2; return code; } // Unexpected sequence
+
+        if (octet >= 0xf0)
+        {
+            code = ((octet & 0x7) << 18) | ((octet1 & 0x3f) << 12) | ((octet2 & 0x3f) << 6) | (octet3 & 0x3f);
+            *bytesProcessed = 4;
+        }
+    }
+
+    if (code > 0x10ffff) code = 0x3f; // Codepoints after U+10ffff are invalid
+
+    return code;
+}
 //----------------------------------------------------------------------------------
 
 //----------------------------------------------------------------------------------