SHOGUN  3.2.1
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义  
StringFeatures.h
浏览该文件的文档.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2009 Soeren Sonnenburg
8  * Written (W) 1999-2008 Gunnar Raetsch
9  * Written (W) 2011-2012 Heiko Strathmann
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
13 #ifndef _CSTRINGFEATURES__H__
14 #define _CSTRINGFEATURES__H__
15 
16 #include <shogun/lib/common.h>
17 #include <shogun/lib/Cache.h>
19 #include <shogun/lib/Compressor.h>
20 #include <shogun/io/File.h>
21 
24 
25 namespace shogun
26 {
// Forward declarations of types that the declarations below use only by
// pointer, by reference or as template arguments.
27 class CAlphabet;
28 template <class T> class CDynamicArray;
29 class CFile;
30 template <class T> class SGString;
31 template <class T> class SGStringList;
32 
33 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Plain struct with three int fields: feature1, feature2 and group.
// NOTE(review): the meaning of the fields is not visible in this listing —
// presumably two feature identifiers plus a group identifier; confirm
// against the code that uses this struct.
34 struct SSKDoubleFeature
35 {
36  int feature1;
37  int feature2;
38  int group;
39 };
40 
// Plain struct with four int fields: feature1, feature2, feature3 and group.
// Same layout as SSKDoubleFeature with one additional feature field.
// NOTE(review): the meaning of the fields is not visible in this listing —
// presumably three feature identifiers plus a group identifier; confirm
// against the code that uses this struct.
41 struct SSKTripleFeature
42 {
43  int feature1;
44  int feature2;
45  int feature3;
46  int group;
47 };
48 #endif
49 
// Class template CStringFeatures<ST>, publicly derived from CFeatures.
// ST is the element type used by the SGString<ST>, SGStringList<ST> and
// SGVector<ST> arguments and return values declared below.
//
// NOTE(review): this listing carries the documentation viewer's line-number
// gutter at the start of every line, and the numbers skip wherever a line was
// not rendered (documentation comments and some declarations). Treat the set
// of members shown here as possibly incomplete.
73 template <class ST> class CStringFeatures : public CFeatures
74 {
75  public:
78 
84 
 // Construct from a string list and an alphabet given as an EAlphabet value.
90  CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha);
91 
 // Construct from a string list and an alphabet given as a CAlphabet object.
97  CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
98 
 // Construct from a CAlphabet object only.
103  CStringFeatures(CAlphabet* alpha);
104 
 // Copy constructor.
109  CStringFeatures(const CStringFeatures& orig);
110 
 // Construct from a CFile; the alphabet defaults to DNA (letters A,C,G,T).
116  CStringFeatures(CFile* loader, EAlphabet alpha=DNA);
117 
119  virtual ~CStringFeatures();
120 
126  virtual void cleanup();
127 
 // Clean up the single vector with index num.
134  virtual void cleanup_feature_vector(int32_t num);
135 
 // Clean up the vectors in the index range given by start and stop.
 // NOTE(review): whether stop is inclusive is not visible here — confirm.
143  virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
144 
149  virtual EFeatureClass get_feature_class() const;
150 
155  virtual EFeatureType get_feature_type() const;
156 
161  CAlphabet* get_alphabet();
162 
167  virtual CFeatures* duplicate() const;
168 
 // Feature vector access, SGVector flavour.
176  SGVector<ST> get_feature_vector(int32_t num);
177 
185  void set_feature_vector(SGVector<ST> vector, int32_t num);
186 
189 
194 
 // Feature vector access, raw pointer flavour: len and dofree are output
 // parameters (passed by non-const reference). The dofree value has a
 // matching parameter on free_feature_vector(ST*, int32_t, bool) below.
205  ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree);
206 
213  CStringFeatures<ST>* get_transposed();
214 
 // Overload that reports its dimensions through num_feat and num_vec.
228  SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
229 
238  void free_feature_vector(ST* feat_vec, int32_t num, bool dofree);
239 
247  void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
248 
 // Single element access: element feat_num of vector vec_num.
257  virtual ST get_feature(int32_t vec_num, int32_t feat_num);
258 
266  virtual int32_t get_vector_length(int32_t vec_num);
267 
274  virtual int32_t get_max_vector_length();
275 
277  virtual int32_t get_num_vectors() const;
278 
286 
295 
296  // these functions are necessary to find out about a former conversion process
297 
303 
308  int32_t get_order();
309 
317  ST get_masked_symbols(ST symbol, uint8_t mask);
318 
325  ST shift_offset(ST offset, int32_t amount);
326 
333  ST shift_symbol(ST symbol, int32_t amount);
334 
339  virtual void load(CFile* loader);
340 
 // Defaults: remap_to_bin=true, ascii_alphabet=DNA (letters A,C,G,T),
 // binary_alphabet=RAWDNA (letters 0,1,2,3).
351  void load_ascii_file(char* fname, bool remap_to_bin=true,
352  EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA);
353 
362  bool load_fasta_file(const char* fname, bool ignore_invalid=false);
363 
373  bool load_fastq_file(const char* fname,
374  bool ignore_invalid=false, bool bitremap_in_single_string=false);
375 
383  bool load_from_directory(char* dirname);
384 
 // Set the stored strings from an SGStringList.
390  void set_features(SGStringList<ST> feats);
391 
 // Set the stored strings from a raw SGString array plus its vector count
 // and maximum string length.
401  bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
402  int32_t p_max_string_length);
403 
 // Append the strings of another CStringFeatures<ST> object.
412  bool append_features(CStringFeatures<ST>* sf);
413 
 // Append strings from a raw SGString array plus its vector count and
 // maximum string length.
426  bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
427  int32_t p_max_string_length);
428 
432  SGStringList<ST> get_features();
433 
 // num_str and max_str_len are output parameters.
442  virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
443 
 // NOTE(review): by its name this returns a copy rather than the internal
 // array — ownership of the result is not visible here; confirm.
452  virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
453 
 // Pointer-based output variant: results are written through dst and num_str.
461  virtual void get_features(SGString<ST>** dst, int32_t* num_str);
462 
469  virtual void save(CFile* writer);
470 
479  virtual bool load_compressed(char* src, bool decompress);
480 
490  virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level);
491 
497  virtual bool apply_preprocessor(bool force_preprocessing=false);
498 
511  int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
512 
523  int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
524  int32_t skip=0);
525 
 // Takes its input from char string features; same trailing parameters as
 // the templated obtain_from_char_features below.
539  bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
540  int32_t p_order, int32_t gap, bool rev);
541 
 // Member template: the source features may use any element type CT.
553  template <class CT>
554  bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
555  int32_t p_order, int32_t gap, bool rev);
556 
 // len defaults to -1.
 // NOTE(review): -1 presumably means "no specific length required" — confirm.
566  bool have_same_length(int32_t len=-1);
567 
573  void embed_features(int32_t p_order);
574 
581  void compute_symbol_mask_table(int64_t max_val);
582 
 // unembed_word / embed_word convert between a single ST word and a
 // sequence of len elements (direction per the function name).
589  void unembed_word(ST word, uint8_t* seq, int32_t len);
590 
596  ST embed_word(ST* seq, int32_t len);
597 
603 
 // Static helper; needs no CStringFeatures instance.
611  static ST* get_zero_terminated_string_copy(SGString<ST> str);
612 
 // Note the argument order (num first) differs from the SGVector overload
 // set_feature_vector(SGVector<ST> vector, int32_t num) above.
621  virtual void set_feature_vector(int32_t num, ST* string, int32_t len);
622 
 // hist, rows and cols are output parameters; normalize defaults to true.
627  virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols,
628  bool normalize=true);
629 
634  virtual void create_random(float64_t* hist, int32_t rows, int32_t cols,
635  int32_t num_vec);
636 
645  virtual CFeatures* copy_subset(SGVector<index_t> indices);
646 
 // Returns the string literal "StringFeatures".
648  virtual const char* get_name() const { return "StringFeatures"; }
649 
651  virtual void subset_changed_post();
652 
653  protected:
 // Virtual hook taking a vector index and an output length.
 // NOTE(review): presumably called when a vector has to be computed on
 // demand — the caller is not visible in this listing; confirm.
664  virtual ST* compute_feature_vector(int32_t num, int32_t& len);
665 
666  private:
 // Private initialisation helper; its body is not part of this header listing.
667  void init();
668 
669  protected:
672 
 // Data member backing the vector count.
 // NOTE(review): presumably what get_num_vectors() reports — confirm.
674  int32_t num_vectors;
675 
678 
681 
684 
687 
690 
693 
 // order used in higher order mapping
695  int32_t order;
696 
699 
702 
705 
708 };
709 }
710 #endif // _CSTRINGFEATURES__H__
bool preprocess_on_get
preprocess on-the-fly?
DNA - letters A,C,G,T.
Definition: Alphabet.h:23
bool load_fasta_file(const char *fname, bool ignore_invalid=false)
void set_feature_vector(SGVector< ST > vector, int32_t num)
void set_features(SGStringList< ST > feats)
virtual CFeatures * duplicate() const
RAWDNA - letters 0,1,2,3.
Definition: Alphabet.h:26
virtual EFeatureType get_feature_type() const
virtual SGString< ST > * copy_features(int32_t &num_str, int32_t &max_str_len)
EAlphabet
Alphabet of charfeatures/observations.
Definition: Alphabet.h:20
SGString< ST > * features
ST shift_offset(ST offset, int32_t amount)
CFeatures(int32_t size=0)
Definition: Features.cpp:23
virtual void load(CFile *loader)
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:88
ST get_masked_symbols(ST symbol, uint8_t mask)
void compute_symbol_mask_table(int64_t max_val)
int32_t length_of_single_string
length of prior single string
floatmax_t get_max_num_symbols()
floatmax_t num_symbols
number of used symbols
virtual EFeatureClass get_feature_class() const
bool obtain_from_char_features(CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
virtual int32_t get_num_vectors() const
virtual void cleanup_feature_vectors(int32_t start, int32_t stop)
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:35
int32_t symbol_mask_table_len
length of the symbol mask table
virtual bool save_compressed(char *dest, E_COMPRESSION_TYPE compression, int level)
virtual const char * get_name() const
shogun string
virtual int32_t get_vector_length(int32_t vec_num)
int32_t order
order used in higher order mapping
bool obtain_from_char(CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
ST shift_symbol(ST symbol, int32_t amount)
virtual void get_histogram(float64_t **hist, int32_t *rows, int32_t *cols, bool normalize=true)
int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
floatmax_t get_original_num_symbols()
double float64_t
Definition: common.h:48
long double floatmax_t
Definition: common.h:49
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
void load_ascii_file(char *fname, bool remap_to_bin=true, EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA)
SGVector< ST > get_feature_vector(int32_t num)
SGStringList< ST > get_features()
int32_t obtain_by_position_list(int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0)
virtual void create_random(float64_t *hist, int32_t rows, int32_t cols, int32_t num_vec)
virtual CFeatures * copy_subset(SGVector< index_t > indices)
CCache< ST > * feature_cache
EFeatureType
shogun feature type
Definition: FeatureTypes.h:16
E_COMPRESSION_TYPE
Definition: Compressor.h:21
void unembed_word(ST word, uint8_t *seq, int32_t len)
bool append_features(CStringFeatures< ST > *sf)
ST * symbol_mask_table
symbol mask table
bool load_fastq_file(const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false)
virtual void subset_changed_post()
virtual int32_t get_max_vector_length()
floatmax_t original_num_symbols
original number of used symbols (before higher order mapping)
virtual void cleanup_feature_vector(int32_t num)
virtual bool load_compressed(char *src, bool decompress)
static ST * get_zero_terminated_string_copy(SGString< ST > str)
virtual ST get_feature(int32_t vec_num, int32_t feat_num)
void embed_features(int32_t p_order)
bool have_same_length(int32_t len=-1)
ST embed_word(ST *seq, int32_t len)
virtual void save(CFile *writer)
virtual ST * compute_feature_vector(int32_t num, int32_t &len)
bool load_from_directory(char *dirname)
CStringFeatures< ST > * get_transposed()
virtual bool apply_preprocessor(bool force_preprocessing=false)
template class SGStringList
Definition: SGObject.h:44

SHOGUN 机器学习工具包 - 项目文档