37 using namespace shogun;
50 REQUIRE(data,
"Data required for classification in apply_multiclass\n")
56 for (int32_t i=0; i<num_vecs; i++)
101 REQUIRE(data,
"Data required for training\n")
108 set_root(id3train(data, dynamic_cast<CMulticlassLabels*>(
m_labels), feature_ids, 0));
128 if (feature_id_vector.
vlen == 0)
134 int32_t most_label = labels[0];
135 int32_t most_num = 1;
138 for (int32_t i=1; i<labels.
vlen; i++)
140 while ((labels[i] == labels[i-1]) && (i<labels.
vlen))
149 most_label = labels[i-1];
155 node->data.class_label = most_label;
161 int32_t best_feature_index = -1;
164 float64_t gain = informational_gain_attribute(i,feats,class_labels);
169 best_feature_index = i;
175 for (int32_t i=0; i<num_vecs; i++)
181 for (int32_t i=0; i<best_labels_unique.
vlen; i++)
184 int32_t num_cols = 0;
185 float64_t active_feature_value = best_labels_unique[i];
187 for (int32_t j=0; j<num_vecs; j++)
189 if ( active_feature_value == best_feature_values[j])
198 for (int32_t j=0; j<num_vecs; j++)
201 if (active_feature_value == sample[best_feature_index])
204 for (int32_t k=0; k<sample.
size(); k++)
206 if (k != best_feature_index)
207 mat(++idx, cnt) = sample[k];
210 new_labels_vector[cnt] = class_labels->
get_labels()[j];
218 for (int32_t j=0;j<feature_id_vector.
vlen;j++)
220 if (j!=best_feature_index)
221 new_feature_id_vector[++cnt] = feature_id_vector[j];
227 node_t* child = id3train(new_data, new_class_labels, new_feature_id_vector, level+1);
228 child->data.transit_if_feature_value = active_feature_value;
229 node->data.attribute_id = feature_id_vector[best_feature_index];
230 node->add_child(child);
241 float64_t CID3ClassifierTree::informational_gain_attribute(int32_t attr_no,
CFeatures* data,
244 REQUIRE(data,
"Data required for information gain calculation\n")
246 "Dense data required for information gain calculation\n")
250 int32_t num_vecs = feats->get_num_vectors();
255 for (int32_t i=0; i<num_vecs; i++)
256 attribute_values[i] = (feats->get_feature_vector(i))[attr_no];
261 for (int32_t i=0; i<attr_val_unique.vlen; i++)
264 int32_t attr_count=0;
266 for (int32_t j=0; j<num_vecs; j++)
268 if (attribute_values[j] == attr_val_unique[i])
275 for (int32_t j=0; j<num_vecs; j++)
277 if (attribute_values[j] == attr_val_unique[i])
278 sub_class[count++] = class_labels->
get_label(j);
282 float64_t sub_entropy = entropy(sub_labels);
283 gain += sub_entropy*(attr_count-0.f)/(num_vecs-0.f);
288 float64_t data_entropy = entropy(class_labels);
289 gain = data_entropy-gain;
313 if (log_ratios[i] != 0)
CTreeMachineNode< id3TreeNodeData > node_t
virtual int32_t get_num_labels() const
virtual ~CID3ClassifierTree()
float64_t transit_if_feature_value
SGVector< float64_t > get_unique_labels()
CTreeMachineNode< id3TreeNodeData > * get_root()
int32_t get_num_features() const
int32_t get_num_elements() const
float64_t get_label(int32_t idx)
structure to store data of a node of id3 tree. This can be used as a template type in TreeMachineNode...
void set_root(CTreeMachineNode< id3TreeNodeData > *root)
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
SGVector< float64_t > get_labels()
Multiclass Labels for multi-class classification.
static float64_t entropy(float64_t *p, int32_t len)
void range_fill(T start=0)
virtual EFeatureClass get_feature_class() const =0
Dynamic array class for CSGObject pointers that creates an array that can be used like a list or an a...
virtual CDynamicObjectArray * get_children()
virtual bool train_machine(CFeatures *data=NULL)
The class Features is the base class of all feature objects.
static float64_t log(float64_t v)
CSGObject * get_element(int32_t index) const
class TreeMachine, a base class for tree based multiclass classifiers. This class is derived from CBa...
virtual int32_t get_num_vectors() const
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)