Open Chinese Convert  1.0.3
A project for conversion between Traditional and Simplified Chinese
UTF8Util.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2013 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #pragma once
20 
21 #include "Common.hpp"
22 
23 namespace opencc {
28 class OPENCC_EXPORT UTF8Util {
29 public:
// Declaration only; the definition is not part of this listing.
// NOTE(review): judging by the name, this presumably advances `fp` past a
// leading UTF-8 byte order mark -- confirm against the definition.
33  static void SkipUtf8Bom(FILE* fp);
34 
/**
 * Reports how many bytes the character starting at `str` occupies, judged
 * solely from its first byte, without ever throwing.
 *
 * @param str Pointer to the first byte of a character; only that single byte
 *            is read.
 * @return 1..6 according to the lead-byte bit pattern, or 0 when the byte
 *         matches none of the recognised lead-byte patterns (for example a
 *         continuation byte of the form 10xxxxxx). A NUL byte counts as a
 *         one-byte character.
 */
static size_t NextCharLengthNoException(const char* str) {
  const unsigned char lead = static_cast<unsigned char>(*str);
  // The three-byte form is tested first, exactly as before; the bit patterns
  // are mutually exclusive, so the order does not affect the result.
  if ((lead & 0xF0) == 0xE0) {
    return 3;  // 1110xxxx
  }
  if ((lead & 0x80) == 0x00) {
    return 1;  // 0xxxxxxx
  }
  if ((lead & 0xE0) == 0xC0) {
    return 2;  // 110xxxxx
  }
  if ((lead & 0xF8) == 0xF0) {
    return 4;  // 11110xxx
  }
  if ((lead & 0xFC) == 0xF8) {
    return 5;  // 111110xx
  }
  if ((lead & 0xFE) == 0xFC) {
    return 6;  // 1111110x
  }
  return 0;
}
56 
60  static size_t NextCharLength(const char* str) {
61  size_t length = NextCharLengthNoException(str);
62  if (length == 0) {
63  throw InvalidUTF8(str);
64  }
65  return length;
66  }
67 
71  static size_t PrevCharLength(const char* str) {
72  {
73  const size_t length = NextCharLengthNoException(str - 3);
74  if (length == 3) {
75  return length;
76  }
77  }
78  {
79  const size_t length = NextCharLengthNoException(str - 1);
80  if (length == 1) {
81  return length;
82  }
83  }
84  {
85  const size_t length = NextCharLengthNoException(str - 2);
86  if (length == 2) {
87  return length;
88  }
89  }
90  for (size_t i = 4; i <= 6; i++) {
91  const size_t length = NextCharLengthNoException(str - i);
92  if (length == i) {
93  return length;
94  }
95  }
96  throw InvalidUTF8(str);
97  }
98 
102  static const char* NextChar(const char* str) {
103  return str + NextCharLength(str);
104  }
105 
109  static const char* PrevChar(const char* str) {
110  return str - PrevCharLength(str);
111  }
112 
116  static size_t Length(const char* str) {
117  size_t length = 0;
118  while (*str != '\0') {
119  str = NextChar(str);
120  length++;
121  }
122  return length;
123  }
124 
131  static const char* FindNextInline(const char* str, const char ch) {
132  while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
133  str = NextChar(str);
134  }
135  return str;
136  }
137 
/**
 * Tells whether `ch` terminates a line or the whole string.
 *
 * @param ch Byte to classify.
 * @return true for '\0', '\n' and '\r'; false for every other value.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
    case '\0':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
144 
/**
 * Copies up to `length` bytes of `str` into a new string of exactly `length`
 * bytes.
 *
 * Copying follows strncpy semantics: it stops at the first NUL in `str`, and
 * any remaining bytes of the result are '\0'.
 *
 * The copy goes through the string's mutable storage (`&newStr[0]`); writing
 * through a const_cast of c_str() is undefined behaviour.
 *
 * @param str    Source bytes; read up to `length` bytes or the first NUL.
 * @param length Size in bytes of the returned string.
 * @return A string whose size() is `length`.
 */
static std::string FromSubstr(const char* str, size_t length) {
  std::string newStr(length, '\0');
  if (length > 0) {
    strncpy(&newStr[0], str, length);
  }
  return newStr;
}
154 
/**
 * Checks that a NUL-terminated string holds at least `byteLength` bytes
 * before its terminator, inspecting at most `byteLength` bytes.
 *
 * @param str        NUL-terminated string.
 * @param byteLength Required minimum length in bytes.
 * @return false if a NUL is found within the first `byteLength` bytes,
 *         true otherwise (always true when `byteLength` is 0).
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t offset = 0; offset < byteLength; ++offset) {
    if (str[offset] == '\0') {
      return false;
    }
  }
  return true;
}
168 
173  static string TruncateUTF8(const char* str, size_t maxByteLength) {
174  string wordTrunc;
175  if (NotShorterThan(str, maxByteLength)) {
176  size_t len = 0;
177  const char* pStr = str;
178  for (;;) {
179  const size_t charLength = NextCharLength(pStr);
180  if (len + charLength > maxByteLength) {
181  break;
182  }
183  pStr += charLength;
184  len += charLength;
185  }
186  wordTrunc = FromSubstr(str, len);
187  } else {
188  wordTrunc = str;
189  }
190  return wordTrunc;
191  }
192 
/**
 * Replaces, in place, every non-overlapping occurrence of `from` in `str`
 * with `to`, scanning left to right. Text inserted by a replacement is never
 * rescanned.
 *
 * @param str  String to modify.
 * @param from NUL-terminated pattern to search for. An empty pattern leaves
 *             `str` untouched: it would otherwise match at every position
 *             and the loop would never terminate.
 * @param to   NUL-terminated replacement text.
 */
static void ReplaceAll(std::string& str, const char* from, const char* to) {
  const std::string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    return;
  }
  const std::string::size_type toLen = strlen(to);
  std::string::size_type pos = 0;
  // Passing the known lengths avoids re-measuring `from`/`to` on every pass.
  while ((pos = str.find(from, pos, fromLen)) != std::string::npos) {
    str.replace(pos, fromLen, to, toLen);
    pos += toLen;  // continue after the inserted text
  }
}
205 
/**
 * Concatenates the elements of `strings`, inserting `separator` between
 * consecutive elements (not before the first, not after the last).
 *
 * The result is built by appending into a string whose capacity is reserved
 * up front, rather than through a std::ostringstream.
 *
 * @param strings   Elements to join, in order.
 * @param separator Text placed between neighbouring elements.
 * @return The joined string; empty when `strings` is empty.
 */
static std::string Join(const std::vector<std::string>& strings,
                        const std::string& separator) {
  std::string joined;
  if (strings.empty()) {
    return joined;
  }
  size_t totalBytes = separator.size() * (strings.size() - 1);
  for (const auto& str : strings) {
    totalBytes += str.size();
  }
  joined.reserve(totalBytes);
  bool first = true;
  for (const auto& str : strings) {
    if (!first) {
      joined += separator;
    }
    joined += str;
    first = false;
  }
  return joined;
}
221 
/**
 * Concatenates the elements of `strings` in order, with nothing in between.
 *
 * The result is built by appending into a string whose capacity is reserved
 * up front, rather than through a std::ostringstream.
 *
 * @param strings Elements to concatenate.
 * @return The concatenation; empty when `strings` is empty.
 */
static std::string Join(const std::vector<std::string>& strings) {
  size_t totalBytes = 0;
  for (const auto& str : strings) {
    totalBytes += str.size();
  }
  std::string joined;
  joined.reserve(totalBytes);
  for (const auto& str : strings) {
    joined += str;
  }
  return joined;
}
232 
233  static void GetByteMap(const char* str, const size_t utf8Length,
234  vector<size_t>* byteMap) {
235  if (byteMap->size() < utf8Length) {
236  byteMap->resize(utf8Length);
237  }
238  const char* pstr = str;
239  for (size_t i = 0; i < utf8Length; i++) {
240  (*byteMap)[i] = pstr - str;
241  pstr = NextChar(pstr);
242  }
243  }
244 };
245 }
static size_t PrevCharLength(const char *str)
Returns the length in bytes of the previous UTF8 character.
Definition: UTF8Util.hpp:71
static string Join(const vector< string > &strings)
Joins a string vector into a single string.
Definition: UTF8Util.hpp:225
Definition: Exception.hpp:77
static string Join(const vector< string > &strings, const string &separator)
Joins a string vector into a single string, with a separator between elements.
Definition: UTF8Util.hpp:209
static string TruncateUTF8(const char *str, size_t maxByteLength)
Truncates a string with a maximal length in byte.
Definition: UTF8Util.hpp:173
static void ReplaceAll(string &str, const char *from, const char *to)
Replaces all patterns in a string in place.
Definition: UTF8Util.hpp:196
static string FromSubstr(const char *str, size_t length)
Copies a substring with given length to a new std::string.
Definition: UTF8Util.hpp:148
static bool NotShorterThan(const char *str, size_t byteLength)
Returns true if the given string is longer or as long as the given length.
Definition: UTF8Util.hpp:158
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 string.
Definition: UTF8Util.hpp:116
Definition: BinaryDict.hpp:24
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition: UTF8Util.hpp:109
static const char * NextChar(const char *str)
Returns a char* pointer just past the next UTF8 character.
Definition: UTF8Util.hpp:102
UTF8 string utilities.
Definition: UTF8Util.hpp:28
static const char * FindNextInline(const char *str, const char ch)
Finds a character in the same line.
Definition: UTF8Util.hpp:131
static size_t NextCharLength(const char *str)
Returns the length in bytes of the next UTF8 character; throws InvalidUTF8 if the leading byte is not valid.
Definition: UTF8Util.hpp:60
static bool IsLineEndingOrFileEnding(const char ch)
Returns true if the character is a line ending or end of file.
Definition: UTF8Util.hpp:141
static size_t NextCharLengthNoException(const char *str)
Returns the length in bytes of the next UTF8 character, or 0 (without throwing) if the leading byte is not valid.
Definition: UTF8Util.hpp:39