35 CFile(fname, rw, name)
44 SG_UNREF(m_delimiter_feat_tokenizer);
45 SG_UNREF(m_delimiter_label_tokenizer);
// CLibSVMFile::init() as captured in this listing: the visible statements
// reset the four tokenizer pointers (whitespace, feature-delimiter,
// label-delimiter and line tokenizers) to NULL.
// The leading numbers are the listing's own line numbers; the jump from 51 to
// 55 means some lines of the function are not reproduced here.
51 void CLibSVMFile::init()
55 m_whitespace_tokenizer=NULL;
56 m_delimiter_feat_tokenizer=NULL;
57 m_delimiter_label_tokenizer=NULL;
58 m_line_tokenizer=NULL;
// CLibSVMFile::init_with_defaults() as captured in this listing. The visible
// statements set the label delimiter to ',' and, for each delimiter tokenizer,
// flag its delimiter character in the tokenizer's `delimiters` table before
// calling SG_REF on it.
// The leading numbers are the listing's own line numbers; gaps (e.g. 66 -> 70)
// mean lines are missing here, including the ones that create the tokenizers.
63 void CLibSVMFile::init_with_defaults()
// Labels within one label token are separated by ','.
66 m_delimiter_label=
',';
// NOTE(review): SG_REF presumably takes a reference on the object; its
// definition is not visible in this listing - confirm.
70 SG_REF(m_whitespace_tokenizer);
// Mark the feature delimiter character as a split character.
73 m_delimiter_feat_tokenizer->
delimiters[m_delimiter_feat]=1;
74 SG_REF(m_delimiter_feat_tokenizer);
// Mark the label delimiter character as a split character.
77 m_delimiter_label_tokenizer->
delimiters[m_delimiter_label]=1;
78 SG_REF(m_delimiter_label_tokenizer);
// Macro families that generate the CLibSVMFile::get_sparse_matrix and
// CLibSVMFile::set_sparse_matrix overloads, followed by the first lines of
// CLibSVMFile::get_num_lines(). The embedded numbers are the listing's own
// line numbers; wherever they skip, lines of the original (braces, some
// declarations, the macro instantiations) are not reproduced here.
//
// GET_SPARSE_MATRIX: unlabeled reader; forwards to the multi-label overload
//   with load_labels=false.
// GET_LABELED_SPARSE_MATRIX: single-label reader; forwards to the multi-label
//   overload, REQUIREs that every vector has exactly one label, copies that
//   label into a freshly SG_MALLOC'ed `labels` array and frees `multilabel`.
//   NOTE(review): the REQUIRE message reads "%s a multilabel file" - it looks
//   like the word "is" is missing; the runtime string is left untouched here.
// GET_MULTI_LABELED_SPARSE_MATRIX: the actual parser. It sizes the outputs
//   from get_num_lines(), then per line: splits on the whitespace tokenizer;
//   when load_labels is set, takes the first token as the label entry unless
//   is_feat_entry() says it is a feature, in which case it is queued as a
//   feature and the label entry is emptied; splits each feature token with
//   the feature-delimiter tokenizer into an index and a value read via
//   read_func; raises num_feat to the largest index seen; stores
//   feat_index-1 in the sparse vector; splits the label entry with the
//   label-delimiter tokenizer, recording each value and adding unseen values
//   to `classes`, whose element count becomes num_classes.
//   NOTE(review): the declaration of `entry` and any initialisation of
//   num_feat are not visible in this listing - confirm in the full file.
// SET_SPARSE_MATRIX: unlabeled writer; forwards with a NULL label array.
// SET_LABELED_SPARSE_MATRIX: wraps each label in a one-element
//   SGVector<float64_t>, forwards to the multi-label writer, then frees the
//   temporary array.
// SET_MULTI_LABELED_SPARSE_MATRIX: writes one text line per vector; when
//   labels are given they are printed with "%lg", separated by ',' and
//   followed by a space (a lone space when a vector has no labels); each
//   feature is printed as feat_index+1, a separator character (its argument
//   is on a line missing from this listing) and the value using `format`.
// get_num_lines(): only its signature and an m_line_reader->reset() call are
//   visible here.
88 #define GET_SPARSE_MATRIX(read_func, sg_type) \ 89 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec) \ 91 SGVector<float64_t>* multilabel; \ 92 int32_t num_classes; \ 93 get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, false); \ 109 #undef GET_SPARSE_MATRIX 111 #define GET_LABELED_SPARSE_MATRIX(read_func, sg_type) \ 112 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec, \ 113 float64_t*& labels, bool load_labels) \ 115 SGVector<float64_t>* multilabel; \ 116 int32_t num_classes; \ 117 get_sparse_matrix(mat_feat, num_feat, num_vec, multilabel, num_classes, load_labels); \ 119 for (int32_t i=0; i<num_vec; i++) \ 121 REQUIRE(multilabel[i].size()==1, \ 122 "%s a multilabel file. You are trying to read it with a single-label reader.", filename); \ 124 labels=SG_MALLOC(float64_t, num_vec); \ 126 for (int32_t i=0; i<num_vec; i++) \ 127 labels[i]=multilabel[i][0]; \ 128 SG_FREE(multilabel); \ 144 #undef GET_LABELED_SPARSE_MATRIX 146 #define GET_MULTI_LABELED_SPARSE_MATRIX(read_func, sg_type) \ 147 void CLibSVMFile::get_sparse_matrix(SGSparseVector<sg_type>*& mat_feat, int32_t& num_feat, int32_t& num_vec, \ 148 SGVector<float64_t>*& multilabel, int32_t& num_classes, bool load_labels) \ 152 SG_INFO("counting line numbers in file %s\n", filename) \ 153 num_vec=get_num_lines(); \ 155 int32_t current_line_ind=0; \ 156 SGVector<char> line; \ 158 int32_t num_feat_entries=0; \ 159 DynArray<SGVector<char> > entries_feat; \ 160 DynArray<float64_t > entries_label; \ 161 DynArray<float64_t> classes; \ 163 mat_feat=SG_MALLOC(SGSparseVector<sg_type>, num_vec); \ 164 multilabel=SG_MALLOC(SGVector<float64_t>, num_vec); \ 169 while (m_line_reader->has_next()) \ 171 num_feat_entries=0; \ 172 entries_feat.reset(SGVector<char>(false)); \ 173 line=m_line_reader->read_line(); \ 175 m_parser->set_tokenizer(m_whitespace_tokenizer); \ 176 
m_parser->set_text(line); \ 178 SGVector<char> entry_label; \ 179 if (load_labels && m_parser->has_next()) \ 181 entry_label=m_parser->read_string(); \ 182 if (is_feat_entry(entry_label)) \ 184 entries_feat.push_back(entry_label); \ 185 num_feat_entries++; \ 186 entry_label=SGVector<char>(0); \ 190 while (m_parser->has_next()) \ 192 entries_feat.push_back(m_parser->read_string()); \ 193 num_feat_entries++; \ 196 mat_feat[current_line_ind]=SGSparseVector<sg_type>(num_feat_entries); \ 197 for (int32_t i=0; i<num_feat_entries; i++) \ 199 m_parser->set_tokenizer(m_delimiter_feat_tokenizer); \ 200 m_parser->set_text(entries_feat[i]); \ 202 int32_t feat_index=0; \ 204 if (m_parser->has_next()) \ 205 feat_index=m_parser->read_int(); \ 209 if (m_parser->has_next()) \ 210 entry=m_parser->read_func(); \ 212 if (feat_index>num_feat) \ 213 num_feat=feat_index; \ 215 mat_feat[current_line_ind].features[i].feat_index=feat_index-1; \ 216 mat_feat[current_line_ind].features[i].entry=entry; \ 221 m_parser->set_tokenizer(m_delimiter_label_tokenizer); \ 222 m_parser->set_text(entry_label); \ 224 int32_t num_label_entries=0; \ 225 entries_label.reset(0); \ 227 while (m_parser->has_next()) \ 229 num_label_entries++; \ 230 float64_t label_val=m_parser->read_real(); \ 232 if (classes.find_element(label_val)==-1) \ 233 classes.push_back(label_val); \ 235 entries_label.push_back(label_val); \ 237 multilabel[current_line_ind]=SGVector<float64_t>(num_label_entries); \ 239 for (int32_t j=0; j < num_label_entries; j++) \ 240 multilabel[current_line_ind][j]=entries_label[j]; \ 244 current_line_ind++; \ 245 SG_PROGRESS(current_line_ind, 0, num_vec, 1, "LOADING:\t") \ 247 num_classes=classes.get_num_elements(); \ 251 SG_INFO("file successfully read\n") \ 267 #undef GET_MULTI_LABELED_SPARSE_MATRIX 269 #define SET_SPARSE_MATRIX(format, sg_type) \ 270 void CLibSVMFile::set_sparse_matrix( \ 271 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec) \ 273 SGVector <float64_t>* 
labels = NULL; \ 274 set_sparse_matrix(matrix, num_feat, num_vec, labels); \ 290 #undef SET_SPARSE_MATRIX 292 #define SET_LABELED_SPARSE_MATRIX(format, sg_type) \ 293 void CLibSVMFile::set_sparse_matrix( \ 294 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \ 295 const float64_t* labels) \ 297 SGVector<float64_t>* multilabel=SG_MALLOC(SGVector<float64_t>, num_vec); \ 299 for (int32_t i=0; i<num_vec; i++) \ 301 multilabel[i]=SGVector<float64_t>(1); \ 302 multilabel[i][0]=labels[i]; \ 305 set_sparse_matrix(matrix, num_feat, num_vec, multilabel); \ 306 SG_FREE(multilabel); \ 322 #undef SET_LABELED_SPARSE_MATRIX 324 #define SET_MULTI_LABELED_SPARSE_MATRIX(format, sg_type) \ 325 void CLibSVMFile::set_sparse_matrix( \ 326 const SGSparseVector<sg_type>* matrix, int32_t num_feat, int32_t num_vec, \ 327 const SGVector<float64_t>* multilabel) \ 331 for (int32_t i=0; i<num_vec; i++) \ 333 if (multilabel!=NULL) \ 335 if (multilabel[i].size()==0) \ 336 fprintf(file, " "); \ 338 for (int32_t j=0; j <multilabel[i].size(); j++) \ 340 fprintf(file, "%lg", multilabel[i][j]); \ 342 if (j==multilabel[i].size()-1) \ 343 fprintf(file, " "); \ 345 fprintf(file, ","); \ 349 for (int32_t j=0; j<matrix[i].num_feat_entries; j++) \ 351 fprintf(file, "%d%c%" format " ", \ 352 matrix[i].features[j].feat_index+1, \ 354 matrix[i].features[j].entry); \ 356 fprintf(file, "\n"); \ 375 #undef SET_MULTI_LABELED_SPARSE_MATRIX 377 int32_t CLibSVMFile::get_num_lines()
385 m_line_reader->
reset();
void set_text(SGVector< char > text)
#define SET_LABELED_SPARSE_MATRIX(format, sg_type)
#define GET_SPARSE_MATRIX(read_func, sg_type)
#define SET_SPARSE_MATRIX(format, sg_type)
virtual float64_t read_real()
Class for buffered reading from an ASCII file.
#define GET_LABELED_SPARSE_MATRIX(read_func, sg_type)
Class for reading from a string.
A File access base class.
void set_tokenizer(CTokenizer *tokenizer)
All classes and functions are contained in the shogun namespace.
The class CDelimiterTokenizer is used to tokenize a SGVector<char> into tokens using custom chars as ...
SGVector< bool > delimiters
#define GET_MULTI_LABELED_SPARSE_MATRIX(read_func, sg_type)
#define SET_MULTI_LABELED_SPARSE_MATRIX(format, sg_type)