-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathU16Str.cpp
More file actions
107 lines (95 loc) · 4.37 KB
/
U16Str.cpp
File metadata and controls
107 lines (95 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#include <cstdint>
#include <cpp-pinyin/U16Str.h>
#include <string>
#include <stdexcept>
namespace Pinyin
{
/// Encodes a single UTF-16 code unit as a UTF-8 byte sequence.
///
/// @param ch16 The code unit to encode.
/// @return 1 byte for values up to 0x7F, 2 bytes for values up to 0x7FF,
///         and 3 bytes for every other value.
///
/// Note: the code unit is encoded on its own, with no surrogate handling.
/// A value in the surrogate range (0xD800-0xDFFF) therefore takes the
/// 3-byte path like any other value above 0x7FF.
std::string u16strToUtf8str(const char16_t &ch16) {
    const unsigned int unit = ch16;

    // A single 16-bit code unit never needs more than three UTF-8 bytes.
    char encoded[3];
    std::string::size_type length = 0;

    if (unit > 0x7FF) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        encoded[length++] = static_cast<char>(0xE0 | (unit >> 12));
        encoded[length++] = static_cast<char>(0x80 | ((unit >> 6) & 0x3F));
        encoded[length++] = static_cast<char>(0x80 | (unit & 0x3F));
    } else if (unit > 0x7F) {
        // 110xxxxx 10xxxxxx
        encoded[length++] = static_cast<char>(0xC0 | (unit >> 6));
        encoded[length++] = static_cast<char>(0x80 | (unit & 0x3F));
    } else {
        // 0xxxxxxx (plain ASCII)
        encoded[length++] = static_cast<char>(unit);
    }

    return std::string(encoded, length);
}
/// Encodes a UTF-16 string as UTF-8.
///
/// Code units below 0x80 become one byte, those below 0x800 two bytes, and
/// the remaining non-surrogate units three bytes. A high surrogate
/// (0xD800-0xDBFF) followed by a low surrogate (0xDC00-0xDFFF) is combined
/// into one supplementary code point and emitted as four bytes.
///
/// @param u16str The UTF-16 input.
/// @return The UTF-8 encoding of @p u16str.
/// @throws std::invalid_argument if a high surrogate is not followed by a
///         low surrogate, or if a low surrogate appears without a preceding
///         high surrogate (neither can be represented in well-formed UTF-8).
std::string u16strToUtf8str(const std::u16string &u16str) {
    std::string utf8str;
    // Each code unit produces at most 3 bytes (a surrogate pair is 2 units -> 4 bytes).
    utf8str.reserve(u16str.size() * 3);

    for (size_t i = 0; i < u16str.size(); ++i) {
        const uint16_t ch = u16str[i];
        if (ch < 0x80) {
            // 1-byte sequence
            utf8str.push_back(static_cast<char>(ch));
        } else if (ch < 0x800) {
            // 2-byte sequence
            utf8str.push_back(static_cast<char>(0xC0 | (ch >> 6)));
            utf8str.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
        } else if (ch >= 0xD800 && ch <= 0xDBFF) {
            // High surrogate: must be completed by a low surrogate.
            if (i + 1 >= u16str.size())
                throw std::invalid_argument("Invalid UTF-16 surrogate pair");
            const uint16_t low = u16str[i + 1];
            if (low < 0xDC00 || low > 0xDFFF)
                throw std::invalid_argument("Invalid UTF-16 surrogate pair");
            const uint32_t codepoint =
                (static_cast<uint32_t>(ch - 0xD800) << 10) + (low - 0xDC00) + 0x10000;
            utf8str.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
            utf8str.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
            utf8str.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            utf8str.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
            ++i; // The low surrogate has been consumed as part of the pair.
        } else if (ch >= 0xDC00 && ch <= 0xDFFF) {
            // Low surrogate with no preceding high surrogate: reject it rather
            // than emitting an ill-formed 3-byte sequence.
            throw std::invalid_argument("Invalid UTF-16 surrogate pair");
        } else {
            // 3-byte sequence
            utf8str.push_back(static_cast<char>(0xE0 | (ch >> 12)));
            utf8str.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
            utf8str.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
        }
    }
    return utf8str;
}
/// Decodes a UTF-8 byte string into UTF-16.
///
/// 1-, 2- and 3-byte sequences each produce one UTF-16 code unit; a 4-byte
/// sequence produces a high/low surrogate pair.
///
/// @param utf8str The UTF-8 input.
/// @return The UTF-16 encoding of @p utf8str.
/// @throws std::invalid_argument ("Invalid UTF-8 sequence") when the input
///         is truncated, starts a sequence with a byte that cannot be a lead
///         byte (0x80-0xC1, 0xF5-0xFF), contains a byte that is not a
///         continuation byte where one is required, uses an overlong
///         encoding, or encodes a value above U+10FFFF.
///
/// Note: a 3-byte sequence that encodes a surrogate value (0xD800-0xDFFF) is
/// still accepted and passed through as a single code unit, because the
/// single-code-unit overload of u16strToUtf8str produces such sequences.
std::u16string utf8strToU16str(const std::string &utf8str) {
    // Returns the 6 payload bits of the continuation byte at `pos`,
    // rejecting anything that is not of the form 10xxxxxx.
    const auto continuation = [&utf8str](size_t pos) -> uint32_t {
        const unsigned char byte = static_cast<unsigned char>(utf8str[pos]);
        if ((byte & 0xC0) != 0x80)
            throw std::invalid_argument("Invalid UTF-8 sequence");
        return byte & 0x3Fu;
    };

    const size_t size = utf8str.size();
    std::u16string u16str;
    u16str.reserve(size); // UTF-16 never needs more code units than UTF-8 has bytes

    size_t i = 0;
    while (i < size) {
        const unsigned char c = static_cast<unsigned char>(utf8str[i]);
        if (c < 0x80) {
            // 1-byte sequence
            u16str.push_back(static_cast<char16_t>(c));
            ++i;
        } else if (c >= 0xC2 && c <= 0xDF) {
            // 2-byte sequence (leads 0xC0/0xC1 could only encode overlong forms)
            if (size - i < 2)
                throw std::invalid_argument("Invalid UTF-8 sequence");
            const uint32_t codepoint = ((c & 0x1Fu) << 6) | continuation(i + 1);
            u16str.push_back(static_cast<char16_t>(codepoint));
            i += 2;
        } else if (c >= 0xE0 && c <= 0xEF) {
            // 3-byte sequence
            if (size - i < 3)
                throw std::invalid_argument("Invalid UTF-8 sequence");
            const uint32_t codepoint =
                ((c & 0x0Fu) << 12) | (continuation(i + 1) << 6) | continuation(i + 2);
            if (codepoint < 0x800) // overlong encoding
                throw std::invalid_argument("Invalid UTF-8 sequence");
            u16str.push_back(static_cast<char16_t>(codepoint));
            i += 3;
        } else if (c >= 0xF0 && c <= 0xF4) {
            // 4-byte sequence: a supplementary code point, stored as a surrogate pair
            if (size - i < 4)
                throw std::invalid_argument("Invalid UTF-8 sequence");
            uint32_t codepoint = ((c & 0x07u) << 18) | (continuation(i + 1) << 12) |
                                 (continuation(i + 2) << 6) | continuation(i + 3);
            // Below 0x10000 is overlong; above 0x10FFFF has no UTF-16 representation.
            if (codepoint < 0x10000 || codepoint > 0x10FFFF)
                throw std::invalid_argument("Invalid UTF-8 sequence");
            codepoint -= 0x10000;
            u16str.push_back(static_cast<char16_t>(0xD800 | (codepoint >> 10)));   // High surrogate
            u16str.push_back(static_cast<char16_t>(0xDC00 | (codepoint & 0x3FF))); // Low surrogate
            i += 4;
        } else {
            // Stray continuation byte (0x80-0xBF), overlong lead (0xC0/0xC1),
            // or a lead byte beyond the UTF-8 range (0xF5-0xFF).
            throw std::invalid_argument("Invalid UTF-8 sequence");
        }
    }
    return u16str;
}
}