00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00032 #ifndef _UCOMMON_UNICODE_H_
00033 #define _UCOMMON_UNICODE_H_
00034
00035 #ifndef _UCOMMON_STRING_H_
00036 #include <ucommon/string.h>
00037 #endif
00038
00039 NAMESPACE_UCOMMON
00040
/**
 * 32-bit signed integer type used throughout this header to carry a single
 * unicode code point (the name suggests UCS-4; the typedef itself only fixes
 * the width and signedness).
 */
00045 typedef int32_t ucs4_t;
00046
/**
 * 16-bit integer type used to carry a single 16-bit unicode unit (the name
 * suggests UCS-2).
 * NOTE(review): this is a signed type, so values above 0x7FFF are negative
 * once stored here; confirm callers never widen or compare without masking.
 */
00050 typedef int16_t ucs2_t;
00051
/**
 * Untyped pointer used to pass unicode string buffers around. Because it is
 * a plain void*, the element type of the buffer is not fixed by this header
 * (presumably a platform wide-character type; TODO confirm).
 * Note that "const unicode_t" means a const pointer (void *const), not a
 * pointer to const data.
 */
00055 typedef void *unicode_t;
00056
/**
 * Collection of static helpers for working with utf8 encoded C strings.
 * Every member is static, so the class acts as a scope for the functions
 * and is also used as a base of UString. Only declarations are visible in
 * this header; the behavioral notes below are inferred from names and
 * signatures and should be confirmed against the implementation.
 */
00062 class __EXPORT utf8
00063 {
00064 public:
/**
 * Constant defined out of line. Presumably the byte size of one element of
 * the buffers passed as unicode_t; confirm against the definition.
 */
00068 static const unsigned ucsize;
00069
/**
 * Pointer to a constant C string, defined out of line. The pointer itself
 * is not const. Presumably a shared placeholder for an empty or absent
 * utf8 string; confirm against the definition.
 */
00073 static const char *nil;
00074
/**
 * Presumably the number of bytes occupied by the utf8 sequence that starts
 * at the given address.
 * @param codepoint pointer to the first byte of an encoded code point.
 * @return size as an unsigned count.
 */
00080 static unsigned size(const char *codepoint);
00081
/**
 * Presumably the number of code points (not bytes) in a utf8 string.
 * UString::count() and utf8_pointer::len() in this header forward to it.
 * @param string utf8 encoded C string.
 * @return count as size_t.
 */
00087 static size_t count(const char *string);
00088
/**
 * Presumably locates the code point at a given position in a utf8 string.
 * The position is signed; whether negative values index from the end is
 * not visible here (TODO confirm).
 * @param string utf8 encoded C string (mutable).
 * @param position code point position.
 * @return pointer into string.
 */
00095 static char *offset(char *string, ssize_t position);
00096
/**
 * Decode one code point from utf8 text. utf8_pointer::operator*() in this
 * header forwards to it.
 * @param encoded pointer to the encoded bytes.
 * @return the code point as ucs4_t; the error value, if any, is not
 * visible here.
 */
00102 static ucs4_t codepoint(const char *encoded);
00103
/**
 * Presumably the number of utf8 bytes needed to encode a unicode string.
 * @param string unicode buffer (const pointer, data not const-qualified).
 * @return count as size_t.
 */
00109 static size_t chars(const unicode_t string);
00110
/**
 * Presumably the number of utf8 bytes needed to encode one code point.
 * @param character code point to measure.
 * @return count as size_t.
 */
00116 static size_t chars(ucs4_t character);
00117
/**
 * Presumably converts a unicode string to utf8 and writes it through a
 * character protocol object.
 * @param string unicode buffer to convert.
 * @param buffer character protocol that receives the output.
 * @return a size_t count; its exact meaning is not visible here.
 */
00124 static size_t unpack(const unicode_t string, CharacterProtocol& buffer);
00125
/**
 * Presumably reads utf8 from a character protocol object and stores the
 * decoded result into a unicode buffer.
 * @param unicode destination unicode buffer.
 * @param buffer character protocol that supplies the input.
 * @param size presumably the capacity of the destination; units TODO confirm.
 * @return a size_t count; its exact meaning is not visible here.
 */
00133 static size_t pack(unicode_t unicode, CharacterProtocol& buffer, size_t size);
00134
/**
 * Presumably returns a newly allocated ucs4_t copy of a utf8 string. The
 * dupfree<ucs4_t*> specialization in this header releases such pointers
 * with ::free, which suggests malloc-family allocation (confirm).
 * @param string utf8 encoded C string.
 * @return pointer to ucs4_t data.
 */
00138 static ucs4_t *udup(const char *string);
00139
/**
 * Presumably returns a newly allocated ucs2_t copy of a utf8 string. The
 * dupfree<ucs2_t*> specialization in this header releases such pointers
 * with ::free, which suggests malloc-family allocation (confirm).
 * @param string utf8 encoded C string.
 * @return pointer to ucs2_t data.
 */
00143 static ucs2_t *wdup(const char *string);
00144
/**
 * Presumably searches a utf8 string forward for a code point.
 * @param string utf8 encoded C string to search.
 * @param character code point to look for.
 * @param start starting position, 0 by default; units TODO confirm.
 * @return pointer into string; the not-found value is not visible here.
 */
00152 static const char *find(const char *string, ucs4_t character, size_t start = 0);
00153
/**
 * Presumably searches a utf8 string backward for a code point.
 * @param string utf8 encoded C string to search.
 * @param character code point to look for.
 * @param end ending position; defaults to the largest size_t value.
 * @return pointer into string; the not-found value is not visible here.
 */
00161 static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
00162
/**
 * Presumably counts the occurrences of a code point in a utf8 string.
 * @param string utf8 encoded C string to scan.
 * @param character code point to count.
 * @return count as unsigned.
 */
00169 static unsigned ccount(const char *string, ucs4_t character);
00170
/**
 * Presumably reads one utf8 encoded code point from a character protocol.
 * @param buffer character protocol to read from.
 * @return code point as ucs4_t; the end/error value is not visible here.
 */
00176 static ucs4_t get(CharacterProtocol& buffer);
00177
/**
 * Presumably writes one code point, utf8 encoded, to a character protocol.
 * @param character code point to write.
 * @param buffer character protocol to write to.
 * @return a ucs4_t; its exact meaning is not visible here.
 */
00184 static ucs4_t put(ucs4_t character, CharacterProtocol& buffer);
00185 };
00186
/**
 * String class that adds utf8 / unicode aware operations on top of String,
 * and also inherits the static utf8 helpers. Only declarations and a few
 * inline forwarders are visible in this header; notes about out-of-line
 * members are inferred from names and signatures and should be confirmed
 * against the implementation.
 *
 * The interface is public. Several members (count, ccount, find, rfind, get)
 * hide the public static utf8 members of the same name, so if they were not
 * accessible neither they nor the hidden utf8 functions could be called on
 * a UString, and the class could not be constructed or destroyed by
 * ordinary callers.
 */
00193 class __EXPORT UString : public String, public utf8
00194 {
00195 public:
/**
 * Default constructor; presumably creates an empty string.
 */
00199 UString();
00200
/**
 * Construct from a size; presumably pre-allocates storage.
 * NOTE(review): not explicit, so an integer converts implicitly.
 * @param size requested size; units TODO confirm.
 */
00205 UString(strsize_t size);
00206
/**
 * Construct from a unicode buffer.
 * NOTE(review): unicode_t is void*, and this constructor is not explicit,
 * so any pointer to non-const data (including char*) converts implicitly
 * and would be treated as unicode data.
 * @param text unicode buffer to convert.
 */
00211 UString(const unicode_t text);
00212
/**
 * Construct from utf8 text with a size.
 * @param text utf8 encoded C string.
 * @param size size limit; units TODO confirm.
 */
00219 UString(const char *text, strsize_t size);
00220
/**
 * Construct from a range delimited by two pointers.
 * NOTE(review): the parameters are pointers to unicode_t (void *const *),
 * not unicode_t values; confirm this is the intended type.
 * @param text start of the range.
 * @param end end of the range.
 */
00227 UString(const unicode_t *text, const unicode_t *end);
00228
/**
 * Copy constructor.
 * @param existing string to copy from.
 */
00234 UString(const UString& existing);
00235
/**
 * Virtual destructor.
 */
00240 virtual ~UString();
00241
/**
 * Presumably extracts a sub-string addressed in code points.
 * @param codepoint starting position.
 * @param size length, 0 by default; meaning of 0 TODO confirm.
 * @return the extracted text as a new UString.
 */
00248 UString get(strsize_t codepoint, strsize_t size = 0) const;
00249
/**
 * Presumably converts the string into a caller supplied unicode buffer.
 * The inline operator()(unicode_t, size_t) forwards to this overload.
 * @param unicode destination buffer.
 * @param size presumably the capacity of the destination; units TODO confirm.
 * @return a size_t count; its exact meaning is not visible here.
 */
00256 size_t get(unicode_t unicode, size_t size) const;
00257
/**
 * Presumably replaces the content with the conversion of a unicode buffer.
 * @param unicode source buffer.
 */
00262 void set(const unicode_t unicode);
00263
/**
 * Presumably appends the conversion of a unicode buffer.
 * @param unicode source buffer.
 */
00268 void add(const unicode_t unicode);
00269
/**
 * Presumably returns the code point at a position. operator[] forwards here.
 * @param position signed position; handling of negatives TODO confirm.
 * @return code point as ucs4_t.
 */
00275 ucs4_t at(int position) const;
00276
/**
 * Call-operator shorthand that forwards to get(unicode, size).
 * @param unicode destination buffer.
 * @param size passed through unchanged.
 * @return whatever get(unicode, size) returns.
 */
00283 inline size_t operator()(unicode_t unicode, size_t size) const
00284 {return get(unicode, size);}
00285
/**
 * Sub-string call operator; left(), right() and copy() are written in
 * terms of it. The implementation is not visible here.
 * @param codepoint signed starting position; right() passes a negated
 * offset, which suggests negative values count from the end (confirm).
 * @param size length; right() passes 0, whose meaning is TODO confirm.
 * @return the selected text as a new UString.
 */
00292 UString operator()(int codepoint, strsize_t size) const;
00293
/**
 * Shorthand for operator()(0, size).
 * @param size length passed through.
 * @return result of the sub-string call operator.
 */
00299 inline UString left(strsize_t size) const
00300 {return operator()(0, size);}
00301
/**
 * Shorthand for operator()(-offset, 0); offset is converted to int before
 * it is negated.
 * @param offset value to negate and pass as the position.
 * @return result of the sub-string call operator.
 */
00307 inline UString right(strsize_t offset) const
00308 {return operator()(-((int)offset), 0);}
00309
/**
 * Shorthand for operator()(offset, size) with offset converted to int.
 * @param offset starting position.
 * @param size length passed through.
 * @return result of the sub-string call operator.
 */
00316 inline UString copy(strsize_t offset, strsize_t size) const
00317 {return operator()((int)offset, size);}
00318
/**
 * Presumably removes part of the string.
 * @param offset starting position.
 * @param size length, 0 by default; meaning of 0 TODO confirm.
 */
00324 void cut(strsize_t offset, strsize_t size = 0);
00325
/**
 * Presumably inserts utf8 text into the string.
 * @param offset position to insert at.
 * @param text utf8 encoded C string to insert.
 * @param size length limit, 0 by default; meaning of 0 TODO confirm.
 */
00332 void paste(strsize_t offset, const char *text, strsize_t size = 0);
00333
/**
 * Presumably returns a pointer to the utf8 text at a position.
 * @param offset signed position.
 * @return pointer to constant character data.
 */
00341 const char *operator()(int offset) const;
00342
/**
 * Index operator; forwards to UString::at(position).
 * @param position signed position.
 * @return code point as ucs4_t.
 */
00348 inline ucs4_t operator[](int position) const
00349 {return UString::at(position);}
00350
/**
 * Returns utf8::count() of the text held by the inherited str member.
 * Hides utf8::count(const char *) for unqualified calls on a UString.
 * NOTE(review): str is dereferenced without a null check; confirm String
 * guarantees it is non-null here.
 * @return count narrowed to strsize_t.
 */
00355 inline strsize_t count(void) const
00356 {return utf8::count(str->text);}
00357
/**
 * Presumably counts occurrences of a code point in this string.
 * @param character code point to count.
 * @return count as unsigned.
 */
00363 unsigned ccount(ucs4_t character) const;
00364
/**
 * Presumably searches this string forward for a code point.
 * @param character code point to look for.
 * @param start starting position, 0 by default.
 * @return pointer to constant text; not-found value is not visible here.
 */
00371 const char *find(ucs4_t character, strsize_t start = 0) const;
00372
/**
 * Presumably searches this string backward for a code point.
 * @param character code point to look for.
 * @param end ending position; defaults to npos, which is declared elsewhere.
 * @return pointer to constant text; not-found value is not visible here.
 */
00379 const char *rfind(ucs4_t character, strsize_t end = npos) const;
00380 };
00381
/**
 * Pointer-like wrapper around utf8 text that moves and indexes in code
 * points. It holds one raw pointer and declares no destructor, so nothing
 * in this header indicates that it owns the memory it points at. Members
 * without an inline body are only declared here; notes on them are inferred
 * from names and signatures and should be confirmed against the
 * implementation.
 */
00387 class __EXPORT utf8_pointer
00388 {
00389 protected:
/** Current position in the text; NULL is the "unset" state tested below. */
00390 uint8_t *text;
00391
00392 public:
/**
 * Default constructor; presumably leaves the pointer NULL.
 */
00396 utf8_pointer();
00397
/**
 * Construct from a C string. Not explicit, so const char* converts
 * implicitly. The argument is const while the member is not, so constness
 * is presumably cast away when storing it.
 * @param string utf8 text to point at.
 */
00402 utf8_pointer(const char *string);
00403
/**
 * Copy constructor.
 * @param copy pointer object to copy from.
 */
00408 utf8_pointer(const utf8_pointer& copy);
00409
/**
 * Prefix increment; presumably advances by one code point.
 * @return this object.
 */
00414 utf8_pointer& operator ++();
00415
/**
 * Prefix decrement; presumably moves back by one code point.
 * @return this object.
 */
00420 utf8_pointer& operator --();
00421
/**
 * Presumably advances by a number of code points.
 * @param offset signed distance.
 * @return this object.
 */
00427 utf8_pointer& operator +=(long offset);
00428
/**
 * Presumably moves back by a number of code points.
 * @param offset signed distance.
 * @return this object.
 */
00434 utf8_pointer& operator -=(long offset);
00435
/**
 * Presumably returns a copy advanced by a number of code points.
 * @param offset signed distance.
 * @return new pointer object; this object is not modified (const member).
 */
00441 utf8_pointer operator+(long offset) const;
00442
/**
 * Presumably returns a copy moved back by a number of code points.
 * @param offset signed distance.
 * @return new pointer object; this object is not modified (const member).
 */
00448 utf8_pointer operator-(long offset) const;
00449
/**
 * True when the held pointer is not NULL. The conversion is not explicit.
 */
00454 inline operator bool() const
00455 {return text != NULL;};
00456
/**
 * True when the held pointer is NULL.
 */
00461 inline bool operator!() const
00462 {return text == NULL;};
00463
/**
 * Presumably returns the code point at an index relative to the current
 * position.
 * @param codepoint signed index.
 * @return code point as ucs4_t.
 */
00469 ucs4_t operator[](long codepoint) const;
00470
/**
 * Assign a new C string to point at.
 * @param string utf8 text.
 * @return this object.
 */
00476 utf8_pointer& operator=(const char *string);
00477
/**
 * Named form of a forward step; presumably equivalent to operator++.
 */
00481 void inc(void);
00482
/**
 * Named form of a backward step; presumably equivalent to operator--.
 */
00486 void dec(void);
00487
/**
 * Compares addresses, not string contents: true when the held pointer and
 * the argument are the same address.
 * @param string address to compare with.
 */
00493 inline bool operator==(const char *string) const
00494 {return (const char *)text == string;};
00495
/**
 * Compares addresses, not string contents: true when the held pointer and
 * the argument are different addresses.
 * @param string address to compare with.
 */
00501 inline bool operator!=(const char *string) const
00502 {return (const char *)text != string;};
00503
/**
 * Decodes the code point at the current position via utf8::codepoint().
 * The held pointer is passed through without a NULL check.
 * @return code point as ucs4_t.
 */
00508 inline ucs4_t operator*() const
00509 {return utf8::codepoint((const char *)text);};
00510
/**
 * Returns the held pointer as a mutable char*, even though the member
 * function is const.
 */
00515 inline char *c_str(void) const
00516 {return (char *)text;};
00517
/**
 * Implicit conversion to a mutable char*; returns the held pointer.
 */
00522 inline operator char*() const
00523 {return (char *)text;};
00524
/**
 * Returns utf8::count() of the text starting at the current position.
 * The held pointer is passed through without a NULL check.
 */
00529 inline size_t len(void) const
00530 {return utf8::count((const char *)text);};
00531 };
00532
/**
 * Free-function shorthand that forwards to utf8::udup().
 * @param string utf8 encoded C string.
 * @return whatever utf8::udup(string) returns (a ucs4_t pointer).
 */
00533 inline ucs4_t *strudup(const char *string)
00534 {return utf8::udup(string);}
00535
/**
 * Free-function shorthand that forwards to utf8::wdup().
 * @param string utf8 encoded C string.
 * @return whatever utf8::wdup(string) returns (a ucs2_t pointer).
 */
00536 inline ucs2_t *strwdup(const char *string)
00537 {return utf8::wdup(string);}
00538
/**
 * Exported function declared here and defined elsewhere. Presumably returns
 * a newly allocated unicode copy of a utf8 string; the dupfree<unicode_t>
 * specialization in this header releases unicode_t values with ::free,
 * which suggests malloc-family allocation (confirm).
 * @param string utf8 encoded C string.
 * @return unicode buffer as unicode_t.
 */
00539 __EXPORT unicode_t unidup(const char *string);
00540
/**
 * Explicit specialization of dupfree (primary template declared elsewhere)
 * for ucs2_t pointers: releases the buffer with the global ::free().
 * @param string pointer to release.
 */
00541 template<>
00542 inline void dupfree<ucs2_t*>(ucs2_t *string)
00543 {::free(string);}
00544
/**
 * Explicit specialization of dupfree (primary template declared elsewhere)
 * for ucs4_t pointers: releases the buffer with the global ::free().
 * @param string pointer to release.
 */
00545 template<>
00546 inline void dupfree<ucs4_t*>(ucs4_t *string)
00547 {::free(string);}
00548
/**
 * Explicit specialization of dupfree (primary template declared elsewhere)
 * for unicode_t, which is void*: releases the buffer with the global
 * ::free().
 * @param string pointer to release.
 */
00549 template<>
00550 inline void dupfree<unicode_t>(unicode_t string)
00551 {::free(string);}
00552
/**
 * Convenience alias for the UString class.
 */
00556 typedef UString ustring_t;
00557
/**
 * Convenience alias for the utf8_pointer class.
 */
00561 typedef utf8_pointer utf8_t;
00562
00563 END_NAMESPACE
00564
00565 #endif