1 /**
2  * Functions related to UTF encoding.
3  *
4  * Copyright:   Copyright (C) 1999-2023 by The D Language Foundation, All Rights Reserved
5  * Authors:     $(LINK2 https://www.digitalmars.com, Walter Bright)
6  * License:     $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/utf.d, _utf.d)
8  * Documentation:  https://dlang.org/phobos/dmd_root_utf.html
9  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/utf.d
10  */
11 
12 module dmd.root.utf;
13 
14 @nogc nothrow pure @safe:
15 
/**
 * Checks whether c lies in the Unicode code space, i.e. the range of code
 * points [0x000000,0x10FFFF] minus the UTF-16 surrogates [0xD800,0xDFFF].
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if c is inside the code space and is not a surrogate
 */
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357

    // Tested first because it covers almost all characters in a typical document.
    const bool belowSurrogates = c < 0xD800;
    const bool aboveSurrogates = 0xDFFF < c && c <= 0x10FFFF;
    return belowSurrogates || aboveSurrogates;
}
29 
/*******************************
 * Tests whether c is a Unicode alphabetic character, using the
 * table from C99 Appendix D.
 * Params:
 *      c = code point to classify
 * Returns:
 *      `true` if c falls inside one of the table's inclusive ranges
 */
bool isUniAlpha(dchar c)
{
    // Inclusive [first, last] ranges, sorted in ascending order so that
    // they can be binary searched.
    static immutable wchar[2][] ALPHA_TABLE =
    [
        [0x00AA, 0x00AA],
        [0x00B5, 0x00B5],
        [0x00B7, 0x00B7],
        [0x00BA, 0x00BA],
        [0x00C0, 0x00D6],
        [0x00D8, 0x00F6],
        [0x00F8, 0x01F5],
        [0x01FA, 0x0217],
        [0x0250, 0x02A8],
        [0x02B0, 0x02B8],
        [0x02BB, 0x02BB],
        [0x02BD, 0x02C1],
        [0x02D0, 0x02D1],
        [0x02E0, 0x02E4],
        [0x037A, 0x037A],
        [0x0386, 0x0386],
        [0x0388, 0x038A],
        [0x038C, 0x038C],
        [0x038E, 0x03A1],
        [0x03A3, 0x03CE],
        [0x03D0, 0x03D6],
        [0x03DA, 0x03DA],
        [0x03DC, 0x03DC],
        [0x03DE, 0x03DE],
        [0x03E0, 0x03E0],
        [0x03E2, 0x03F3],
        [0x0401, 0x040C],
        [0x040E, 0x044F],
        [0x0451, 0x045C],
        [0x045E, 0x0481],
        [0x0490, 0x04C4],
        [0x04C7, 0x04C8],
        [0x04CB, 0x04CC],
        [0x04D0, 0x04EB],
        [0x04EE, 0x04F5],
        [0x04F8, 0x04F9],
        [0x0531, 0x0556],
        [0x0559, 0x0559],
        [0x0561, 0x0587],
        [0x05B0, 0x05B9],
        [0x05BB, 0x05BD],
        [0x05BF, 0x05BF],
        [0x05C1, 0x05C2],
        [0x05D0, 0x05EA],
        [0x05F0, 0x05F2],
        [0x0621, 0x063A],
        [0x0640, 0x0652],
        [0x0660, 0x0669],
        [0x0670, 0x06B7],
        [0x06BA, 0x06BE],
        [0x06C0, 0x06CE],
        [0x06D0, 0x06DC],
        [0x06E5, 0x06E8],
        [0x06EA, 0x06ED],
        [0x06F0, 0x06F9],
        [0x0901, 0x0903],
        [0x0905, 0x0939],
        [0x093D, 0x094D],
        [0x0950, 0x0952],
        [0x0958, 0x0963],
        [0x0966, 0x096F],
        [0x0981, 0x0983],
        [0x0985, 0x098C],
        [0x098F, 0x0990],
        [0x0993, 0x09A8],
        [0x09AA, 0x09B0],
        [0x09B2, 0x09B2],
        [0x09B6, 0x09B9],
        [0x09BE, 0x09C4],
        [0x09C7, 0x09C8],
        [0x09CB, 0x09CD],
        [0x09DC, 0x09DD],
        [0x09DF, 0x09E3],
        [0x09E6, 0x09F1],
        [0x0A02, 0x0A02],
        [0x0A05, 0x0A0A],
        [0x0A0F, 0x0A10],
        [0x0A13, 0x0A28],
        [0x0A2A, 0x0A30],
        [0x0A32, 0x0A33],
        [0x0A35, 0x0A36],
        [0x0A38, 0x0A39],
        [0x0A3E, 0x0A42],
        [0x0A47, 0x0A48],
        [0x0A4B, 0x0A4D],
        [0x0A59, 0x0A5C],
        [0x0A5E, 0x0A5E],
        [0x0A66, 0x0A6F],
        [0x0A74, 0x0A74],
        [0x0A81, 0x0A83],
        [0x0A85, 0x0A8B],
        [0x0A8D, 0x0A8D],
        [0x0A8F, 0x0A91],
        [0x0A93, 0x0AA8],
        [0x0AAA, 0x0AB0],
        [0x0AB2, 0x0AB3],
        [0x0AB5, 0x0AB9],
        [0x0ABD, 0x0AC5],
        [0x0AC7, 0x0AC9],
        [0x0ACB, 0x0ACD],
        [0x0AD0, 0x0AD0],
        [0x0AE0, 0x0AE0],
        [0x0AE6, 0x0AEF],
        [0x0B01, 0x0B03],
        [0x0B05, 0x0B0C],
        [0x0B0F, 0x0B10],
        [0x0B13, 0x0B28],
        [0x0B2A, 0x0B30],
        [0x0B32, 0x0B33],
        [0x0B36, 0x0B39],
        [0x0B3D, 0x0B43],
        [0x0B47, 0x0B48],
        [0x0B4B, 0x0B4D],
        [0x0B5C, 0x0B5D],
        [0x0B5F, 0x0B61],
        [0x0B66, 0x0B6F],
        [0x0B82, 0x0B83],
        [0x0B85, 0x0B8A],
        [0x0B8E, 0x0B90],
        [0x0B92, 0x0B95],
        [0x0B99, 0x0B9A],
        [0x0B9C, 0x0B9C],
        [0x0B9E, 0x0B9F],
        [0x0BA3, 0x0BA4],
        [0x0BA8, 0x0BAA],
        [0x0BAE, 0x0BB5],
        [0x0BB7, 0x0BB9],
        [0x0BBE, 0x0BC2],
        [0x0BC6, 0x0BC8],
        [0x0BCA, 0x0BCD],
        [0x0BE7, 0x0BEF],
        [0x0C01, 0x0C03],
        [0x0C05, 0x0C0C],
        [0x0C0E, 0x0C10],
        [0x0C12, 0x0C28],
        [0x0C2A, 0x0C33],
        [0x0C35, 0x0C39],
        [0x0C3E, 0x0C44],
        [0x0C46, 0x0C48],
        [0x0C4A, 0x0C4D],
        [0x0C60, 0x0C61],
        [0x0C66, 0x0C6F],
        [0x0C82, 0x0C83],
        [0x0C85, 0x0C8C],
        [0x0C8E, 0x0C90],
        [0x0C92, 0x0CA8],
        [0x0CAA, 0x0CB3],
        [0x0CB5, 0x0CB9],
        [0x0CBE, 0x0CC4],
        [0x0CC6, 0x0CC8],
        [0x0CCA, 0x0CCD],
        [0x0CDE, 0x0CDE],
        [0x0CE0, 0x0CE1],
        [0x0CE6, 0x0CEF],
        [0x0D02, 0x0D03],
        [0x0D05, 0x0D0C],
        [0x0D0E, 0x0D10],
        [0x0D12, 0x0D28],
        [0x0D2A, 0x0D39],
        [0x0D3E, 0x0D43],
        [0x0D46, 0x0D48],
        [0x0D4A, 0x0D4D],
        [0x0D60, 0x0D61],
        [0x0D66, 0x0D6F],
        [0x0E01, 0x0E3A],
        [0x0E40, 0x0E5B],
        [0x0E81, 0x0E82],
        [0x0E84, 0x0E84],
        [0x0E87, 0x0E88],
        [0x0E8A, 0x0E8A],
        [0x0E8D, 0x0E8D],
        [0x0E94, 0x0E97],
        [0x0E99, 0x0E9F],
        [0x0EA1, 0x0EA3],
        [0x0EA5, 0x0EA5],
        [0x0EA7, 0x0EA7],
        [0x0EAA, 0x0EAB],
        [0x0EAD, 0x0EAE],
        [0x0EB0, 0x0EB9],
        [0x0EBB, 0x0EBD],
        [0x0EC0, 0x0EC4],
        [0x0EC6, 0x0EC6],
        [0x0EC8, 0x0ECD],
        [0x0ED0, 0x0ED9],
        [0x0EDC, 0x0EDD],
        [0x0F00, 0x0F00],
        [0x0F18, 0x0F19],
        [0x0F20, 0x0F33],
        [0x0F35, 0x0F35],
        [0x0F37, 0x0F37],
        [0x0F39, 0x0F39],
        [0x0F3E, 0x0F47],
        [0x0F49, 0x0F69],
        [0x0F71, 0x0F84],
        [0x0F86, 0x0F8B],
        [0x0F90, 0x0F95],
        [0x0F97, 0x0F97],
        [0x0F99, 0x0FAD],
        [0x0FB1, 0x0FB7],
        [0x0FB9, 0x0FB9],
        [0x10A0, 0x10C5],
        [0x10D0, 0x10F6],
        [0x1E00, 0x1E9B],
        [0x1EA0, 0x1EF9],
        [0x1F00, 0x1F15],
        [0x1F18, 0x1F1D],
        [0x1F20, 0x1F45],
        [0x1F48, 0x1F4D],
        [0x1F50, 0x1F57],
        [0x1F59, 0x1F59],
        [0x1F5B, 0x1F5B],
        [0x1F5D, 0x1F5D],
        [0x1F5F, 0x1F7D],
        [0x1F80, 0x1FB4],
        [0x1FB6, 0x1FBC],
        [0x1FBE, 0x1FBE],
        [0x1FC2, 0x1FC4],
        [0x1FC6, 0x1FCC],
        [0x1FD0, 0x1FD3],
        [0x1FD6, 0x1FDB],
        [0x1FE0, 0x1FEC],
        [0x1FF2, 0x1FF4],
        [0x1FF6, 0x1FFC],
        [0x203F, 0x2040],
        [0x207F, 0x207F],
        [0x2102, 0x2102],
        [0x2107, 0x2107],
        [0x210A, 0x2113],
        [0x2115, 0x2115],
        [0x2118, 0x211D],
        [0x2124, 0x2124],
        [0x2126, 0x2126],
        [0x2128, 0x2128],
        [0x212A, 0x2131],
        [0x2133, 0x2138],
        [0x2160, 0x2182],
        [0x3005, 0x3007],
        [0x3021, 0x3029],
        [0x3041, 0x3093],
        [0x309B, 0x309C],
        [0x30A1, 0x30F6],
        [0x30FB, 0x30FC],
        [0x3105, 0x312C],
        [0x4E00, 0x9FA5],
        [0xAC00, 0xD7A3]
    ];

    // Anything below the first range or above the last one cannot match,
    // so skip the search entirely.
    if (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[$ - 1][1] < c)
        return false;

    // Binary search over the half-open index window [lo, hi).
    size_t lo = 0;
    size_t hi = ALPHA_TABLE.length;
    while (lo < hi)
    {
        const size_t probe = lo + (hi - lo) / 2;
        const wchar first = ALPHA_TABLE[probe][0];
        const wchar last = ALPHA_TABLE[probe][1];
        if (c < first)
            hi = probe;         // continue in the lower half
        else if (last < c)
            lo = probe + 1;     // continue in the upper half
        else
            return true;        // first <= c <= last
    }
    return false;
}
304 
/**
 * Returns the number of UTF-8 code units needed to encode c.
 * Params:
 *      c = code point; must not exceed 0x10FFFF
 * Returns:
 *      1, 2, 3 or 4
 */
int utf_codeLengthChar(dchar c)
{
    // Values beyond the Unicode code space have no UTF-8 length.
    if (c > 0x10FFFF)
        assert(false);

    return c <= 0x7F   ? 1
         : c <= 0x7FF  ? 2
         : c <= 0xFFFF ? 3
         : 4;
}
320 
/**
 * Returns the number of UTF-16 code units needed to encode c.
 * Params:
 *      c = code point
 * Returns:
 *      1 if c is at most 0xFFFF, otherwise 2 (a surrogate pair)
 */
int utf_codeLengthWchar(dchar c)
{
    if (c > 0xFFFF)
        return 2;
    return 1;
}
325 
/**
 * Returns the code length of c in code units for the encoding.
 * Params:
 *      sz = the encoding, given as its code unit size:
 *           1 = utf8, 2 = utf16, 4 = utf32
 *      c = code point to measure
 * Returns:
 *      number of code units c occupies in that encoding
 */
int utf_codeLength(int sz, dchar c)
{
    switch (sz)
    {
    case 1:
        return utf_codeLengthChar(c);
    case 2:
        return utf_codeLengthWchar(c);
    default:
        // UTF-32 always uses a single code unit per code point.
        assert(sz == 4);
        return 1;
    }
}
339 
/**
 * Encodes c as UTF-8 and stores the resulting 1 to 4 code units at s.
 * Params:
 *      s = destination; must be non-null. The function writes through the
 *          raw pointer, so the caller has to provide enough room.
 *      c = code point to encode; must satisfy utf_isValidDchar
 */
void utf_encodeChar(char* s, dchar c) @system
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        *s = cast(char)c;
        return;
    }
    if (c <= 0x07FF)
    {
        // 110xxxxx 10xxxxxx
        s[0] = cast(char)((c >> 6) | 0xC0);
        s[1] = cast(char)((c & 0x3F) | 0x80);
        return;
    }
    if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)((c >> 12) | 0xE0);
        s[1] = cast(char)(((c >> 6) & 0x3F) | 0x80);
        s[2] = cast(char)((c & 0x3F) | 0x80);
        return;
    }
    if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)((c >> 18) | 0xF0);
        s[1] = cast(char)(((c >> 12) & 0x3F) | 0x80);
        s[2] = cast(char)(((c >> 6) & 0x3F) | 0x80);
        s[3] = cast(char)((c & 0x3F) | 0x80);
        return;
    }
    // Beyond the Unicode code space: nothing sensible to emit.
    assert(0);
}
369 
/**
 * Encodes c as UTF-16 and stores the resulting 1 or 2 code units at s.
 * Params:
 *      s = destination; must be non-null. The function writes through the
 *          raw pointer, so the caller has to provide enough room.
 *      c = code point to encode; must satisfy utf_isValidDchar
 */
void utf_encodeWchar(wchar* s, dchar c) @system
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0xFFFF)
    {
        // Fits in a single code unit.
        *s = cast(wchar)c;
        return;
    }

    // Surrogate pair: split the offset from 0x10000 into two 10-bit halves.
    const uint offset = c - 0x010000;
    s[0] = cast(wchar)(0xD800 + ((offset >> 10) & 0x03FF)); // high surrogate
    s[1] = cast(wchar)(0xDC00 + (offset & 0x03FF));         // low surrogate
}
384 
/**
 * Encodes c into the buffer at s using the encoding selected by sz.
 * Params:
 *      sz = code unit size of the target encoding; 1 and 2 dispatch to
 *           utf_encodeChar and utf_encodeWchar, 4 stores c as one dchar
 *      s = destination, reinterpreted as char*, wchar* or dchar* per sz
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c) @system
{
    switch (sz)
    {
    case 1:
        utf_encodeChar(cast(char*)s, c);
        break;
    case 2:
        utf_encodeWchar(cast(wchar*)s, c);
        break;
    default:
        // The code point is stored as-is in a single dchar.
        assert(sz == 4);
        *(cast(dchar*)s) = c;
        break;
    }
}
397 
/********************************************
 * Checks whether a Unicode code point is a bidirectional
 * control character.
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if c is one of the bidirectional control characters
 */
bool isBidiControl(dchar c)
{
    // Source: https://www.unicode.org/versions/Unicode15.0.0, table 23-3.
    if (c == '\u061C')
        return true;
    if (c == '\u200E' || c == '\u200F')
        return true;
    if ('\u202A' <= c && c <= '\u202E')
        return true;
    return '\u2066' <= c && c <= '\u2069';
}
417 
/********************************************
 * Decode a UTF-8 sequence as a single UTF-32 code point.
 * Params:
 *      s = UTF-8 sequence; must not be null
 *      ridx = index in s[] of the first code unit to decode. On success it is
 *             advanced past the whole decoded sequence; on error it is
 *             advanced by exactly one code unit.
 *      rresult = set to the character decoded on success, or to the first
 *                code unit of the sequence on error
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-8 decoding errors
    static immutable string UTF8_DECODE_OK = null; // no error
    static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
    static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
    static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
    static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
    static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    /* The following encodings are valid, except for the 5 and 6 byte
     * combinations:
     *      0xxxxxxx
     *      110xxxxx 10xxxxxx
     *      1110xxxx 10xxxxxx 10xxxxxx
     *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */
    // Sequence length implied by the first code unit, indexed by that code
    // unit. 0xFF marks values that cannot start a sequence: the continuation
    // bytes 0x80..0xBF, and 0xFE and 0xFF.
    static immutable ubyte[256] UTF8_STRIDE =
    [
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,

        3,3,3,3, 3,3,3,3,
        3,3,3,3, 3,3,3,3,

        4,4,4,4, 4,4,4,4,
        5,5,5,5, 6,6,0xFF,0xFF
    ];

    assert(s !is null);
    // ridx is bumped right away, so every error return below leaves it one
    // code unit past the start; only the success path overwrites it.
    size_t i = ridx++;

    const char u = s[i];
    // Pre-stage results for ASCII and error cases
    rresult = u;
    //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len);
    // Get expected sequence length
    const size_t n = UTF8_STRIDE[u];
    switch (n)
    {
    case 1:
        // ASCII
        return UTF8_DECODE_OK;
    case 2:
    case 3:
    case 4:
        // multi-byte UTF-8
        break;
    default:
        // 5- or 6-byte sequence, or a code unit that cannot start a
        // sequence at all (table value 0xFF)
        return UTF8_DECODE_OUTSIDE_CODE_SPACE;
    }
    if (s.length < i + n) // source too short
        return UTF8_DECODE_TRUNCATED_SEQUENCE;
    // Pick off 7 - n low bits from first code unit
    dchar c = u & ((1 << (7 - n)) - 1);
    /* The following combinations are overlong, and illegal:
     *      1100000x (10xxxxxx)
     *      11100000 100xxxxx (10xxxxxx)
     *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *      11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *      11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     */
    // i now indexes the second code unit; it is only peeked at here and is
    // read again (and validated as a trailer) by the loop below.
    const char u2 = s[++i];
    // overlong combination
    // (the 0xF8 and 0xFC tests cannot match here: those first code units
    // have stride 5 and 6 and were rejected by the switch above)
    if ((u & 0xFE) == 0xC0 || (u == 0xE0 && (u2 & 0xE0) == 0x80) || (u == 0xF0 && (u2 & 0xF0) == 0x80) || (u == 0xF8 && (u2 & 0xF8) == 0x80) || (u == 0xFC && (u2 & 0xFC) == 0x80))
        return UTF8_DECODE_OVERLONG;
    // Decode remaining bits
    // m is the index one past the last code unit of the sequence
    for (const m = n + i - 1; i != m; ++i)
    {
        const u3 = s[i];
        if ((u3 & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
            return UTF8_DECODE_INVALID_TRAILER;
        c = (c << 6) | (u3 & 0x3F);
    }
    // Rejects encoded surrogates and values above 0x10FFFF
    if (!utf_isValidDchar(c))
        return UTF8_DECODE_INVALID_CODE_POINT;
    ridx = i;
    rresult = c;
    return UTF8_DECODE_OK;
}
539 
/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
 * Params:
 *      s = UTF-16 sequence; must not be null
 *      ridx = index in s[] of the first code unit to decode. It is advanced
 *             by two code units when a surrogate pair is decoded, and by one
 *             code unit in every other case, including all error returns.
 *      rresult = set to the character decoded on success, or to the first
 *                code unit of the sequence on error
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // High surrogate: a low surrogate must follow
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        wchar u2 = s[i + 1];
        // The second code unit must itself lie in the low surrogate range
        // [0xDC00,0xDFFF]. Both bounds have to be tested against u2;
        // comparing the upper bound against u would accept any u2 above
        // 0xDFFF and decode it into a bogus code point.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 is 0xD800 - (0x10000 >> 10), so the subtraction removes the
        // high surrogate base and adds the 0x10000 offset in one step.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF) // Low surrogate without a preceding high one
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}