SHOGUN  3.2.1
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
StreamingMMD.cpp
浏览该文件的文档.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2012-2013 Heiko Strathmann
4  * Written (w) 2014 Soumyajit De
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright notice, this
11  * list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  * this list of conditions and the following disclaimer in the documentation
14  * and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * The views and conclusions contained in the software and documentation are those
28  * of the authors and should not be interpreted as representing official policies,
29  * either expressed or implied, of the Shogun Development Team.
30  */
31 
36 #include <shogun/lib/List.h>
37 
38 using namespace shogun;
39 
41 {
42  init();
43 }
44 
46  CStreamingFeatures* q, index_t m, index_t blocksize) :
47  CKernelTwoSampleTest(kernel, NULL, m)
48 {
49  init();
50 
51  m_streaming_p=p;
53 
54  m_streaming_q=q;
56 
57  m_blocksize=blocksize;
58 }
59 
61 {
64 
65  /* m_kernel is SG_UNREFed in base destructor */
66 }
67 
68 void CStreamingMMD::init()
69 {
70  SG_ADD((CSGObject**)&m_streaming_p, "streaming_p", "Streaming features p",
72  SG_ADD((CSGObject**)&m_streaming_q, "streaming_q", "Streaming features p",
74  SG_ADD(&m_blocksize, "blocksize", "Number of elements processed at once",
76  SG_ADD(&m_simulate_h0, "simulate_h0", "Whether p and q are mixed",
78 
79  m_streaming_p=NULL;
80  m_streaming_q=NULL;
81  m_blocksize=10000;
82  m_simulate_h0=false;
83 }
84 
86 {
87  /* use wrapper method and compute for single kernel */
88  SGVector<float64_t> statistic;
89  SGVector<float64_t> variance;
90  compute_statistic_and_variance(statistic, variance, false);
91 
92  return statistic[0];
93 }
94 
96 {
97  /* make sure multiple_kernels flag is used only with a combined kernel */
98  REQUIRE(!multiple_kernels || m_kernel->get_kernel_type()==K_COMBINED,
99  "multiple kernels specified, but underlying kernel is not of type "
100  "K_COMBINED\n");
101 
102  SGVector<float64_t> statistic;
103  SGVector<float64_t> variance;
104  compute_statistic_and_variance(statistic, variance, multiple_kernels);
105 
106  return statistic;
107 }
108 
110 {
111  /* use wrapper method and compute for single kernel */
112  SGVector<float64_t> statistic;
113  SGVector<float64_t> variance;
114  compute_statistic_and_variance(statistic, variance, false);
115 
116  return variance[0];
117 }
118 
120 {
121  float64_t result=0;
122 
124  {
125  case MMD1_GAUSSIAN:
126  {
127  /* compute variance and use to estimate Gaussian distribution */
129  result=1.0-CStatistics::normal_cdf(statistic, std_dev);
130  }
131  break;
132 
133  default:
134  /* sampling null is handled here */
135  result=CKernelTwoSampleTest::compute_p_value(statistic);
136  break;
137  }
138 
139  return result;
140 }
141 
143 {
144  float64_t result=0;
145 
147  {
148  case MMD1_GAUSSIAN:
149  {
150  /* compute variance and use to estimate Gaussian distribution */
152  result=1.0-CStatistics::inverse_normal_cdf(1-alpha, 0, std_dev);
153  }
154  break;
155 
156  default:
157  /* sampling null is handled here */
159  break;
160  }
161 
162  return result;
163 }
164 
166 {
167  float64_t result=0;
168 
170  {
171  case MMD1_GAUSSIAN:
172  {
173  /* compute variance and use to estimate Gaussian distribution, use
174  * wrapper method and compute for single kernel */
175  SGVector<float64_t> statistic;
176  SGVector<float64_t> variance;
177  compute_statistic_and_variance(statistic, variance, false);
178 
179  /* estimate Gaussian distribution */
180  result=1.0-CStatistics::normal_cdf(statistic[0],
181  CMath::sqrt(variance[0]));
182  }
183  break;
184 
185  default:
186  /* sampling null can be done separately in superclass */
188  break;
189  }
190 
191  return result;
192 }
193 
195 {
197 
198  /* instead of permuting samples, just sample new data every time. */
201  SG_REF(p);
202  SG_REF(q);
203 
204  bool old=m_simulate_h0;
205  set_simulate_h0(true);
206  for (index_t i=0; i<m_num_null_samples; ++i)
207  {
208  /* compute statistic for this permutation of mixed samples */
209  samples[i]=compute_statistic();
210  }
211  set_simulate_h0(old);
212  m_streaming_p=p;
213  m_streaming_q=q;
214  SG_UNREF(p);
215  SG_UNREF(q);
216 
217  return samples;
218 }
219 
221  index_t num_this_run)
222 {
223  SG_DEBUG("entering!\n");
224 
225  /* the list of blocks of data to be returned, turning delete_data flag
226  * on which SG_REFs the elements when appended or returned. */
227  CList* data=new CList(true);
228 
229  SG_DEBUG("streaming %d blocks from p of blocksize %d!\n", num_blocks,
230  num_this_run);
231 
232  /* stream data from p, num_blocks times */
233  for (index_t i=0; i<num_blocks; ++i)
234  {
235  CFeatures* block=m_streaming_p->get_streamed_features(num_this_run);
236  data->append_element(block);
237  }
238 
239  SG_DEBUG("streaming %d blocks from q of blocksize %d!\n", num_blocks,
240  num_this_run);
241 
242  /* stream data from q, num_blocks times */
243  for (index_t i=0; i<num_blocks; ++i)
244  {
245  CFeatures* block=m_streaming_q->get_streamed_features(num_this_run);
246  data->append_element(block);
247  }
248 
249  /* check whether h0 should be simulated and permute if so */
250  if (m_simulate_h0)
251  {
252  /* create merged copy of all feature instances to permute */
253  SG_DEBUG("merging and premuting features!\n");
254 
255  /* use the first element to merge rest of the data into */
256  CFeatures* merged=(CFeatures*)data->get_first_element();
257  data->delete_element();
258  merged=merged->create_merged_copy(data);
259 
260  /* get rid of unnecessary feature objects */
261  data->delete_all_elements();
262 
263  /* permute */
264  SGVector<index_t> inds(merged->get_num_vectors());
265  inds.range_fill();
266  inds.permute();
267  merged->add_subset(inds);
268 
269  /* copy back */
270  SGVector<index_t> copy(num_this_run);
271  copy.range_fill();
272  for (index_t i=0; i<2*num_blocks; ++i)
273  {
274  CFeatures* current=merged->copy_subset(copy);
275  data->append_element(current);
276  /* SG_UNREF'ing since copy_subset does a SG_REF, this is
277  * safe since the object is already SG_REF'ed inside the list */
278  SG_UNREF(current);
279 
280  if (i<2*num_blocks-1)
281  copy.add(num_this_run);
282  }
283 
284  /* clean up */
285  SG_UNREF(merged);
286  }
287 
288  SG_REF(data);
289 
290  SG_DEBUG("leaving!\n");
291  return data;
292 }
293 
295 {
296  SG_ERROR("Method not implemented since linear time mmd is based on "
297  "streaming features\n");
298 }
299 
301 {
302  SG_ERROR("Method not implemented since linear time mmd is based on "
303  "streaming features\n");
304  return NULL;
305 }
306 
308 {
310  return m_streaming_p;
311 }
312 
314 {
316  return m_streaming_q;
317 }
318 
virtual float64_t compute_threshold(float64_t alpha)
virtual void compute_statistic_and_variance(SGVector< float64_t > &statistic, SGVector< float64_t > &variance, bool multiple_kernels=false)=0
virtual float64_t compute_threshold(float64_t alpha)
virtual float64_t compute_p_value(float64_t statistic)
virtual CStreamingFeatures * get_streaming_q()
virtual CStreamingFeatures * get_streaming_p()
int32_t index_t
Definition: common.h:60
static float64_t inverse_normal_cdf(float64_t y0)
virtual CFeatures * get_streamed_features(index_t num_elements)
#define SG_UNREF(x)
Definition: SGRefObject.h:35
virtual int32_t get_num_vectors() const =0
#define SG_ERROR(...)
Definition: SGIO.h:131
#define REQUIRE(x,...)
Definition: SGIO.h:208
void set_simulate_h0(bool simulate_h0)
Definition: StreamingMMD.h:261
Kernel two sample test base class. Provides an interface for performing a two-sample test using a ker...
CSGObject * delete_element()
Definition: List.h:475
void add(const SGVector< T > x)
Definition: SGVector.cpp:329
virtual SGVector< float64_t > sample_null()
virtual float64_t compute_p_value(float64_t statistic)
virtual float64_t compute_variance_estimate()
CStreamingFeatures * m_streaming_q
Definition: StreamingMMD.h:291
virtual CFeatures * create_merged_copy(CList *others)
Definition: Features.h:229
CSGObject * get_first_element()
Definition: List.h:144
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:102
double float64_t
Definition: common.h:48
#define SG_REF(x)
Definition: SGRefObject.h:34
void range_fill(T start=0)
Definition: SGVector.cpp:145
virtual CFeatures * get_p_and_q()
#define SG_DEBUG(...)
Definition: SGIO.h:109
virtual EKernelType get_kernel_type()=0
virtual CFeatures * copy_subset(SGVector< index_t > indices)
Definition: Features.cpp:330
The class Features is the base class of all feature objects.
Definition: Features.h:62
bool append_element(CSGObject *data)
Definition: List.h:323
Streaming features are features which are used for online algorithms.
static float64_t normal_cdf(float64_t x, float64_t std_dev=1)
ENullApproximationMethod m_null_approximation_method
virtual float64_t perform_test()
The Kernel base class.
Definition: Kernel.h:150
CStreamingFeatures * m_streaming_p
Definition: StreamingMMD.h:288
#define SG_ADD(...)
Definition: SGObject.h:71
static float32_t sqrt(float32_t x)
x^0.5
Definition: Math.h:308
void delete_all_elements()
Definition: List.h:116
virtual void set_p_and_q(CFeatures *p_and_q)
CList * stream_data_blocks(index_t num_blocks, index_t num_this_run)
virtual float64_t perform_test()
virtual void add_subset(SGVector< index_t > subset)
Definition: Features.cpp:307
Class List implements a doubly connected list for low-level-objects.
Definition: List.h:82
virtual float64_t compute_statistic()

SHOGUN 机器学习工具包 - 项目文档