Open Chinese Convert  1.0.3
A project for conversion between Traditional and Simplified Chinese
PhraseExtract.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2015 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #pragma once
20 
21 #include <unordered_map>
22 
23 #include "Common.hpp"
24 #include "UTF8StringSlice.hpp"
25 
26 namespace opencc {
27 
29 public:
30  typedef UTF8StringSlice::LengthType LengthType;
31 
33 
// Default constructor. Only declared here; the definition is not part of
// this listing.
34  PhraseExtract();
35 
// Virtual destructor. Only declared here; the definition is not part of
// this listing.
36  virtual ~PhraseExtract();
37 
38  void Extract(const string& text) {
39  SetFullText(text);
40  ExtractSuffixes();
41  CalculateFrequency();
42  CalculateSuffixEntropy();
43  ReleaseSuffixes();
44  ExtractPrefixes();
45  CalculatePrefixEntropy();
46  ReleasePrefixes();
47  ExtractWordCandidates();
48  CalculateCohesions();
49  SelectWords();
50  }
51 
52  void SetFullText(const string& fullText) {
53  utf8FullText = UTF8StringSlice(fullText.c_str());
54  }
55 
56  void SetFullText(const char* fullText) {
57  utf8FullText = UTF8StringSlice(fullText);
58  }
59 
60  void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }
61 
62  void SetWordMinLength(const LengthType _wordMinLength) {
63  wordMinLength = _wordMinLength;
64  }
65 
66  void SetWordMaxLength(const LengthType _wordMaxLength) {
67  wordMaxLength = _wordMaxLength;
68  }
69 
70  void SetPrefixSetLength(const LengthType _prefixSetLength) {
71  prefixSetLength = _prefixSetLength;
72  }
73 
74  void SetSuffixSetLength(const LengthType _suffixSetLength) {
75  suffixSetLength = _suffixSetLength;
76  }
77 
78  // PreCalculationFilter is called after frequencies statistics.
79  void SetPreCalculationFilter(const std::function<
80  bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>& filter) {
81  preCalculationFilter = filter;
82  }
83 
84  void SetPostCalculationFilter(const std::function<
85  bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>& filter) {
86  postCalculationFilter = filter;
87  }
88 
89  void ReleaseSuffixes() { vector<UTF8StringSlice8Bit>().swap(suffixes); }
90 
91  void ReleasePrefixes() { vector<UTF8StringSlice8Bit>().swap(prefixes); }
92 
93  const vector<UTF8StringSlice8Bit>& Words() const { return words; }
94 
95  const vector<UTF8StringSlice8Bit>& WordCandidates() const {
96  return wordCandidates;
97  }
98 
// Plain record of the four per-candidate statistics; Signal() returns a
// const reference to one of these.
99  struct Signals {
100  size_t frequency; // presumably the value reported by Frequency() — confirm
101  double cohesion; // presumably the value reported by Cohesion() — confirm
102  double suffixEntropy; // presumably the value reported by SuffixEntropy() — confirm
103  double prefixEntropy; // presumably the value reported by PrefixEntropy() — confirm
104  };
105 
// Per-candidate query functions. All of them are const and only declared
// here; what happens for a slice that is not a known candidate cannot be
// determined from this listing.

// Returns a const reference to the Signals record for `wordCandidate`.
106  const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;
107 
// Presumably the `cohesion` signal of `wordCandidate` — confirm in the
// implementation.
108  double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;
109 
// Entropy value for `wordCandidate`.
// NOTE(review): how this relates to the suffix/prefix entropies is not
// visible here — confirm in the implementation.
110  double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;
111 
// Presumably the `suffixEntropy` signal of `wordCandidate` — confirm.
112  double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
113 
// Presumably the `prefixEntropy` signal of `wordCandidate` — confirm.
114  double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
115 
// Presumably the `frequency` signal of `word` — confirm.
116  size_t Frequency(const UTF8StringSlice8Bit& word) const;
117 
// Probability value for `word`.
// NOTE(review): likely derived from Frequency() and totalOccurrence —
// confirm in the implementation.
118  double Probability(const UTF8StringSlice8Bit& word) const;
119 
// Logarithmic counterpart of Probability().
// NOTE(review): logarithm base and use of logTotalOccurrence are not visible
// here — confirm in the implementation.
120  double LogProbability(const UTF8StringSlice8Bit& word) const;
121 
// Individual stage functions. Extract() calls all of them except Reset(),
// in this order: ExtractSuffixes, CalculateFrequency, CalculateSuffixEntropy,
// ExtractPrefixes, CalculatePrefixEntropy, ExtractWordCandidates,
// CalculateCohesions, SelectWords. They are public, so callers may also run
// stages one by one.
// NOTE(review): whether a stage triggers missing prerequisite stages itself
// is not visible in this listing — confirm in the implementation.

// Not called by Extract(). Presumably restores the initial state — confirm.
122  void Reset();
123 
124  void ExtractSuffixes();
125 
126  void ExtractPrefixes();
127 
128  void ExtractWordCandidates();
129 
130  void CalculateFrequency();
131 
132  void CalculateCohesions();
133 
134  void CalculateSuffixEntropy();
135 
136  void CalculatePrefixEntropy();
137 
138  void SelectWords();
139 
140  static bool
141  DefaultPreCalculationFilter(const PhraseExtract&,
143 
144  static bool
145  DefaultPostCalculationFilter(const PhraseExtract&,
147 
148 private:
// Forward declaration only; DictType is defined outside this listing and is
// used here through the `signals` pointer member.
149  class DictType;
150 
151  // Pointwise Mutual Information
// Takes the whole candidate plus two slices named part1 and part2.
// NOTE(review): presumably part1 and part2 are a split of wordCandidate —
// confirm in the implementation.
152  double PMI(const UTF8StringSlice8Bit& wordCandidate,
153  const UTF8StringSlice8Bit& part1,
154  const UTF8StringSlice8Bit& part2) const;
155 
// Computes a cohesion value for `wordCandidate`.
// NOTE(review): presumably built on PMI() — confirm in the implementation.
156  double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;
157 
// Computes an entropy value from `choices`, a map from slice to a size_t
// value.
// NOTE(review): presumably the mapped values are occurrence counts — confirm.
158  double CalculateEntropy(const std::unordered_map<
159  UTF8StringSlice8Bit, size_t, UTF8StringSlice8Bit::Hasher>& choices) const;
160 
// Settings written by the SetWordMinLength / SetWordMaxLength /
// SetPrefixSetLength / SetSuffixSetLength setters.
161  LengthType wordMinLength;
162  LengthType wordMaxLength;
163  LengthType prefixSetLength;
164  LengthType suffixSetLength;
// Filter callbacks written by SetPreCalculationFilter and
// SetPostCalculationFilter.
165  std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
166  preCalculationFilter;
167  std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
168  postCalculationFilter;
169 
// One flag per stage.
// NOTE(review): presumably each records that the matching stage has already
// run — confirm in the implementation.
170  bool prefixesExtracted;
171  bool suffixesExtracted;
172  bool frequenciesCalculated;
173  bool wordCandidatesExtracted;
174  bool cohesionsCalculated;
175  bool prefixEntropiesCalculated;
176  bool suffixEntropiesCalculated;
177  bool wordsSelected;
178 
// Text under analysis, written by the SetFullText overloads.
179  UTF8StringSlice utf8FullText;
180  size_t totalOccurrence;
181  double logTotalOccurrence;
// Working buffers; ReleasePrefixes() and ReleaseSuffixes() empty the first
// two, and WordCandidates() and Words() expose the last two.
182  vector<UTF8StringSlice8Bit> prefixes;
183  vector<UTF8StringSlice8Bit> suffixes;
184  vector<UTF8StringSlice8Bit> wordCandidates;
185  vector<UTF8StringSlice8Bit> words;
// Raw pointer to the forward-declared DictType.
// NOTE(review): ownership and lifetime are not visible in this listing —
// confirm in the constructor and destructor.
186  DictType* signals;
187 
// Gives the test fixture access to the private members.
188  friend class PhraseExtractTest;
189 };
190 
191 } // namespace opencc
Definition: PhraseExtract.hpp:99
Definition: PhraseExtract.hpp:28
Definition: UTF8StringSlice.hpp:198
Definition: BinaryDict.hpp:24
Definition: UTF8StringSlice.hpp:50