28 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
29 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
// Unsigned integer aliases used by this header for UTF-8 octets (uint8_t),
// UTF-16 code units (uint16_t) and UTF-32 code points (uint32_t).
// NOTE(review): the names imply 8/16/32-bit widths, which holds only where
// char, short and int have those sizes — confirm for the target platforms.
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// Boundaries of the UTF-16 surrogate ranges: lead (high) surrogates occupy
// [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX] and trail (low) surrogates occupy
// [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;

// LEAD_SURROGATE_MIN lowered by (0x10000 >> 10). The arithmetic is done in
// int, so the narrowing back to 16 bits is spelled out explicitly.
const uint16_t LEAD_OFFSET = static_cast<uint16_t>(LEAD_SURROGATE_MIN - (0x10000 >> 10));

// Evaluated in unsigned 32-bit arithmetic; the subtraction wraps around
// modulo 2^32 by design.
const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

// Upper bound used by is_code_point_valid() for acceptable code points.
const uint32_t CODE_POINT_MAX = 0x0010ffffu;
// Returns the low 8 bits of `oc` as a uint8_t.
//
// octet_type may be any integral type (including a signed char); masking
// with 0xff before the cast discards any higher or sign-extended bits.
template <typename octet_type>
inline uint8_t mask8(octet_type oc)
{
    return static_cast<uint8_t>(0xff & oc);
}
// Returns the low 16 bits of `oc` as a uint16_t.
//
// u16_type may be any integral type (including a signed one); masking with
// 0xffff before the cast discards any higher or sign-extended bits.
template <typename u16_type>
inline uint16_t mask16(u16_type oc)
{
    return static_cast<uint16_t>(0xffff & oc);
}
// Returns true when `oc` is a UTF-8 trail (continuation) octet, i.e. when
// its two most significant bits are 10 (bit pattern 10xxxxxx).
//
// The value goes through mask8() first so that signed octet types are
// compared on their low 8 bits only.
template <typename octet_type>
inline bool is_trail(octet_type oc)
{
    return ((mask8(oc) >> 6) == 0x2);
}
74 template <
typename u16>
75 inline bool is_surrogate(u16 cp)
77 return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
80 template <
typename u32>
81 inline bool is_code_point_valid(u32 cp)
83 return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
// Classifies the octet at `lead_it` by its high-order bits to determine the
// length of the UTF-8 sequence it introduces. The result type is the
// iterator's difference_type.
// NOTE(review): the first branch of the chain and every return statement are
// missing from this excerpt, so the value returned for each class is not
// visible here — confirm against the complete file.
86 template <
typename octet_iterator>
87 inline typename std::iterator_traits<octet_iterator>::difference_type
88 sequence_length(octet_iterator lead_it)
// Work on the low 8 bits so signed octet types compare correctly.
90 uint8_t lead = mask8(*lead_it);
// Top three bits are 110 (110xxxxx) — presumably a 2-octet lead.
93 else if ((lead >> 5) == 0x6)
// Top four bits are 1110 (1110xxxx) — presumably a 3-octet lead.
95 else if ((lead >> 4) == 0xe)
// Top five bits are 11110 (11110xxx) — presumably a 4-octet lead.
97 else if ((lead >> 3) == 0x1e)
// Result codes reported by validate_next().
//   OK                  - no error.
//   NOT_ENOUGH_ROOM     - fewer octets remain before `end` than the sequence needs.
//   INVALID_LEAD        - the lead octet does not start a sequence.
//   INCOMPLETE_SEQUENCE - an expected trail octet is missing.
//   OVERLONG_SEQUENCE   - the code point was encoded with more octets than needed.
//   INVALID_CODE_POINT  - the decoded value fails is_code_point_valid().
enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
// Decodes and validates the UTF-8 sequence that starts at `it`, reporting
// the outcome as a utf_error. `it` is taken by reference and is moved while
// trail octets are examined; the visible error paths rewind it with
// std::advance before returning.
// NOTE(review): many lines of this function (braces, the branching on
// `length`, several returns, and whatever is done with `code_point`) are
// missing from this excerpt. The comments below describe only the statements
// that are visible; confirm the rest against the complete file.
105 template <
typename octet_iterator>
106 utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
// Seed the code point with the lead octet. No comparison of `it` with `end`
// is visible before this dereference.
108 uint32_t cp = mask8(*it);
110 typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
// Sequence length as classified from the lead octet.
111 octet_difference_type length = sequence_length(it);
// NOTE(review): the condition guarding this return is not visible here.
122 return NOT_ENOUGH_ROOM;
// Fewer octets remain before `end` than the lead octet announces.
126 if (std::distance(it, end) < length)
127 return NOT_ENOUGH_ROOM;
// Presumably the 2-octet case: one trail octet is expected.
135 if (is_trail(*(++it))) {
// Keep the low 5 bits of the lead (moved to bits 6-10) and add the low
// 6 bits of the trail octet.
136 cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
140 return INCOMPLETE_SEQUENCE;
// Presumably the 3-octet case: two trail octets are expected.
144 if (is_trail(*(++it))) {
// Low 4 bits of the lead go to bits 12-15; the first trail's low 6 bits go
// to bits 6-11.
145 cp = ((cp << 12) & 0xffff) + ((mask8(*it) << 6) & 0xfff);
146 if (is_trail(*(++it))) {
// Rewind the two increments performed above before reporting the error.
150 std::advance(it, -2);
151 return INCOMPLETE_SEQUENCE;
156 return INCOMPLETE_SEQUENCE;
// Presumably the 4-octet case: three trail octets are expected.
160 if (is_trail(*(++it))) {
// Low 3 bits of the lead go to bits 18-20; the first trail's low 6 bits go
// to bits 12-17.
161 cp = ((cp << 18) & 0x1fffff) + ((mask8(*it) << 12) & 0x3ffff);
162 if (is_trail(*(++it))) {
// The second trail's low 6 bits go to bits 6-11.
163 cp += (mask8(*it) << 6) & 0xfff;
164 if (is_trail(*(++it))) {
// Rewind the three increments performed above before reporting the error.
168 std::advance(it, -3);
169 return INCOMPLETE_SEQUENCE;
// Rewind the two increments performed above before reporting the error.
173 std::advance(it, -2);
174 return INCOMPLETE_SEQUENCE;
179 return INCOMPLETE_SEQUENCE;
// Reject decoded values that is_code_point_valid() does not accept.
184 if (!is_code_point_valid(cp)) {
// Loop runs length - 1 times; its body is not visible here (presumably it
// steps `it` back to the lead octet).
185 for (octet_difference_type i = 0; i < length - 1; ++i)
187 return INVALID_CODE_POINT;
// Overlong checks: each visible error path moves `it` back by length - 1
// before returning. The conditions nested inside these branches are missing
// from this excerpt.
195 std::advance(it, -(length-1));
196 return OVERLONG_SEQUENCE;
199 else if (cp < 0x800) {
201 std::advance(it, -(length-1));
202 return OVERLONG_SEQUENCE;
205 else if (cp < 0x10000) {
207 std::advance(it, -(length-1));
208 return OVERLONG_SEQUENCE;
216 template <
typename octet_iterator>
217 inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
218 return validate_next(it, end, 0);
// The three octets of the UTF-8 byte order mark (EF BB BF), compared
// against by is_bom().
const uint8_t bom[] = {0xef, 0xbb, 0xbf};
// Scans [start, end) one UTF-8 sequence at a time using
// internal::validate_next and returns an octet_iterator.
// NOTE(review): the return statements are missing from this excerpt.
// Presumably the function returns `result` at the first sequence whose error
// code is not internal::OK, and otherwise once the loop reaches `end` —
// confirm against the complete file.
228 template <
typename octet_iterator>
229 octet_iterator find_invalid(octet_iterator start, octet_iterator end)
231 octet_iterator result = start;
232 while (result != end) {
// validate_next takes `result` by reference and moves it itself.
233 internal::utf_error err_code = internal::validate_next(result, end);
234 if (err_code != internal::OK)
// Returns true when find_invalid() reports no invalid position in
// [start, end), i.e. when its result equals `end`. An empty range is
// therefore considered valid.
template <typename octet_iterator>
inline bool is_valid(octet_iterator start, octet_iterator end)
{
    return (find_invalid(start, end) == end);
}
// Compares the three octets starting at `it` against bom[0..2].
// NOTE(review): the statement enclosing these comparisons is missing from
// this excerpt (presumably `return ( ... );`).
// NOTE(review): no end iterator is taken, so up to three octets are read
// without any visible bounds check — callers presumably must guarantee that
// at least three octets are readable.
246 template <
typename octet_iterator>
247 inline bool is_bom (octet_iterator it)
// Each octet goes through internal::mask8 so signed octet types compare
// correctly. `&&` short-circuits, so later octets are read only when the
// earlier ones matched.
250 (internal::mask8(*it++)) == bom[0] &&
251 (internal::mask8(*it++)) == bom[1] &&
252 (internal::mask8(*it)) == bom[2]
257 #endif // header guard