Open Chinese Convert  1.0.3
A project for conversion between Traditional and Simplified Chinese
UTF8StringSlice.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2015 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include "Common.hpp"
20 #include "UTF8Util.hpp"
21 
22 namespace opencc {
23 
24 namespace internal {
25 
/**
 * Hashes a byte buffer with the FNV-1a scheme (XOR the octet, then multiply
 * by the prime).
 *
 * @param text             first byte of the buffer to hash; it does not need
 *                         to be NUL-terminated because only byteLength bytes
 *                         are read.
 * @param byteLength       number of bytes to consume starting at text.
 * @param FNV_prime        multiplier applied after every octet.
 * @param FNV_offset_basis initial hash value; returned unchanged when
 *                         byteLength is 0.
 * @return the accumulated hash, wrapping modulo the width of size_t.
 */
inline size_t FNVHash(const char* text, const size_t byteLength,
                      const size_t FNV_prime, const size_t FNV_offset_basis) {
  size_t hash = FNV_offset_basis;
  const char* const end = text + byteLength;
  for (const char* pstr = text; pstr < end; pstr++) {
    // Hash the raw octet value. Plain char may be signed, in which case bytes
    // >= 0x80 (every UTF-8 lead/continuation byte) would be sign-extended and
    // flip all upper bits of the hash, making the result platform-dependent.
    hash ^= static_cast<unsigned char>(*pstr);
    hash *= FNV_prime;
  }
  return hash;
}
35 
36 template <int> size_t FNVHash(const char* text, const size_t byteLength);
37 
38 template <>
39 inline size_t FNVHash<4>(const char* text, const size_t byteLength) {
40  return FNVHash(text, byteLength, 16777619UL, 2166136261UL);
41 }
42 
43 template <>
44 inline size_t FNVHash<8>(const char* text, const size_t byteLength) {
45  return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL);
46 }
47 
48 } // namespace internal
49 
50 template <typename LENGTH_TYPE> class UTF8StringSliceBase {
51 public:
52  typedef LENGTH_TYPE LengthType;
53 
54  UTF8StringSliceBase(const char* _str)
55  : str(_str), utf8Length(UTF8Util::Length(_str)),
56  byteLength(strlen(_str)) {}
57 
58  UTF8StringSliceBase(const char* _str, const LengthType _utf8Length)
59  : str(_str), utf8Length(_utf8Length) {
60  CalculateByteLength();
61  }
62 
63  UTF8StringSliceBase(const char* _str, const LengthType _utf8Length,
64  const LengthType _byteLength)
65  : str(_str), utf8Length(_utf8Length), byteLength(_byteLength) {
66  CalculateByteLength();
67  }
68 
69  LengthType UTF8Length() const { return utf8Length; }
70 
71  LengthType ByteLength() const { return byteLength; }
72 
73  UTF8StringSliceBase Left(const LengthType utf8Length) const {
74  if (utf8Length == UTF8Length()) {
75  return *this;
76  } else {
77  return UTF8StringSliceBase(str, utf8Length);
78  }
79  }
80 
81  UTF8StringSliceBase Right(const LengthType utf8Length) const {
82  if (utf8Length == UTF8Length()) {
83  return *this;
84  } else {
85  const char* pstr = str + byteLength;
86  for (size_t i = 0; i < utf8Length; i++) {
87  pstr = UTF8Util::PrevChar(pstr);
88  }
89  return UTF8StringSliceBase(pstr, utf8Length);
90  }
91  }
92 
93  UTF8StringSliceBase SubString(const LengthType offset,
94  const LengthType utf8Length) const {
95  if (offset == 0) {
96  return Left(utf8Length);
97  } else {
98  const char* pstr = str;
99  for (size_t i = 0; i < offset; i++) {
100  pstr = UTF8Util::NextChar(pstr);
101  }
102  return UTF8StringSliceBase(pstr, utf8Length);
103  }
104  }
105 
106  string ToString() const { return string(str, str + byteLength); }
107 
108  const char* CString() const { return str; }
109 
110  LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const {
111  if (str == that.str) {
112  return std::min(utf8Length, that.utf8Length);
113  } else {
114  const char* pstr1 = str;
115  const char* pstr2 = that.str;
116  for (size_t length = 0; length < utf8Length && length < that.utf8Length;
117  length++) {
118  size_t charLen1 = UTF8Util::NextCharLength(pstr1);
119  size_t charLen2 = UTF8Util::NextCharLength(pstr2);
120  if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) {
121  return length;
122  }
123  pstr1 += charLen1;
124  pstr2 += charLen2;
125  }
126  return 0;
127  }
128  }
129 
130  void MoveRight() {
131  if (utf8Length > 0) {
132  const size_t charLen = UTF8Util::NextCharLength(str);
133  str += charLen;
134  utf8Length--;
135  byteLength -= charLen;
136  }
137  }
138 
139  void MoveLeft() {
140  if (utf8Length > 0) {
141  const size_t charLen = UTF8Util::PrevCharLength(str + byteLength);
142  utf8Length--;
143  byteLength -= charLen;
144  }
145  }
146 
147  int ReverseCompare(const UTF8StringSliceBase& that) const {
148  const char* pstr1 = str + byteLength;
149  const char* pstr2 = that.str + that.byteLength;
150  const size_t length = std::min(utf8Length, that.utf8Length);
151  for (size_t i = 0; i < length; i++) {
152  const size_t charLen1 = UTF8Util::PrevCharLength(pstr1);
153  const size_t charLen2 = UTF8Util::PrevCharLength(pstr2);
154  pstr1 -= charLen1;
155  pstr2 -= charLen2;
156  const int cmp = strncmp(pstr1, pstr2, std::min(charLen1, charLen2));
157  if (cmp < 0) {
158  return -1;
159  } else if (cmp > 0) {
160  return 1;
161  } else if (charLen1 < charLen2) {
162  return -1;
163  } else if (charLen1 > charLen2) {
164  return 1;
165  }
166  }
167  if (utf8Length < that.utf8Length) {
168  return -1;
169  } else if (utf8Length > that.utf8Length) {
170  return 1;
171  } else {
172  return 0;
173  }
174  }
175 
176  LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const {
177  return static_cast<LengthType>(
178  ToString().find(pattern.str, 0, pattern.byteLength));
179  }
180 
181  bool operator<(const UTF8StringSliceBase& that) const {
182  return Compare(that) < 0;
183  }
184 
185  bool operator>(const UTF8StringSliceBase& that) const {
186  return Compare(that) > 0;
187  }
188 
189  bool operator==(const UTF8StringSliceBase& that) const {
190  return (str == that.str && utf8Length == that.utf8Length) ||
191  Compare(that) == 0;
192  }
193 
194  bool operator!=(const UTF8StringSliceBase& that) const {
195  return !this->operator==(that);
196  }
197 
198  class Hasher {
199  public:
200  size_t operator()(const UTF8StringSliceBase& text) const {
201  return internal::FNVHash<sizeof(size_t)>(text.CString(),
202  text.ByteLength());
203  }
204  };
205 
206 private:
207  inline int Compare(const UTF8StringSliceBase& that) const {
208  int cmp = strncmp(str, that.str, std::min(byteLength, that.byteLength));
209  if (cmp == 0) {
210  if (utf8Length < that.utf8Length) {
211  cmp = -1;
212  } else if (utf8Length > that.utf8Length) {
213  cmp = 1;
214  } else {
215  cmp = 0;
216  }
217  }
218  return cmp;
219  }
220 
221  void CalculateByteLength() {
222  const char* pstr = str;
223  for (size_t i = 0; i < utf8Length; i++) {
224  pstr = UTF8Util::NextChar(pstr);
225  }
226  byteLength = pstr - str;
227  }
228 
229  const char* str;
230  LengthType utf8Length;
231  LengthType byteLength;
232 };
233 
235 
236 template <typename LENGTH_TYPE>
237 std::ostream& operator<<(::std::ostream& os,
239  return os << str.ToString();
240 }
241 
242 } // namespace opencc
static size_t PrevCharLength(const char *str)
Returns the length in bytes of the previous UTF-8 character.
Definition: UTF8Util.hpp:71
Definition: UTF8StringSlice.hpp:198
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 string.
Definition: UTF8Util.hpp:116
Definition: BinaryDict.hpp:24
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition: UTF8Util.hpp:109
static const char * NextChar(const char *str)
Returns the char* pointer advanced past the next UTF-8 character.
Definition: UTF8Util.hpp:102
static size_t NextCharLength(const char *str)
Returns the length in bytes of the next UTF-8 character.
Definition: UTF8Util.hpp:60
Definition: UTF8StringSlice.hpp:50