13 #ifndef _CSTRINGFEATURES__H__ 14 #define _CSTRINGFEATURES__H__ 16 #include <shogun/lib/config.h> 31 template <
class T>
class CDynamicArray;
33 template <
class T>
class SGString;
34 template <
class T>
class SGStringList;
36 #ifndef DOXYGEN_SHOULD_SKIP_THIS 37 struct SSKDoubleFeature
44 struct SSKTripleFeature
76 template <
class ST>
class CStringFeatures :
public CFeatures
93 CStringFeatures(SGStringList<ST> string_list,
EAlphabet alpha);
100 CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha);
106 CStringFeatures(CAlphabet* alpha);
112 CStringFeatures(
const CStringFeatures& orig);
122 virtual ~CStringFeatures();
129 virtual void cleanup();
137 virtual void cleanup_feature_vector(int32_t num);
146 virtual void cleanup_feature_vectors(int32_t start, int32_t stop);
164 CAlphabet* get_alphabet();
170 virtual CFeatures* duplicate()
const;
179 SGVector<ST> get_feature_vector(int32_t num);
188 void set_feature_vector(SGVector<ST> vector, int32_t num);
191 void enable_on_the_fly_preprocessing();
196 void disable_on_the_fly_preprocessing();
208 ST* get_feature_vector(int32_t num, int32_t& len,
bool& dofree);
216 CStringFeatures<ST>* get_transposed();
231 SGString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec);
241 void free_feature_vector(ST* feat_vec, int32_t num,
bool dofree);
250 void free_feature_vector(SGVector<ST> feat_vec, int32_t num);
260 virtual ST get_feature(int32_t vec_num, int32_t feat_num);
269 virtual int32_t get_vector_length(int32_t vec_num);
277 virtual int32_t get_max_vector_length();
280 virtual int32_t get_num_vectors()
const;
320 ST get_masked_symbols(ST symbol, uint8_t mask);
328 ST shift_offset(ST offset, int32_t amount);
336 ST shift_symbol(ST symbol, int32_t amount);
342 virtual void load(CFile* loader);
354 void load_ascii_file(
char* fname,
bool remap_to_bin=
true,
365 bool load_fasta_file(
const char* fname,
bool ignore_invalid=
false);
376 bool load_fastq_file(
const char* fname,
377 bool ignore_invalid=
false,
bool bitremap_in_single_string=
false);
386 bool load_from_directory(
char* dirname);
393 void set_features(SGStringList<ST> feats);
404 bool set_features(SGString<ST>* p_features, int32_t p_num_vectors,
405 int32_t p_max_string_length);
415 bool append_features(CStringFeatures<ST>* sf);
429 bool append_features(SGString<ST>* p_features, int32_t p_num_vectors,
430 int32_t p_max_string_length);
435 SGStringList<ST> get_features();
445 virtual SGString<ST>* get_features(int32_t& num_str, int32_t& max_str_len);
455 virtual SGString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len);
464 virtual void get_features(SGString<ST>** dst, int32_t* num_str);
472 virtual void save(CFile* writer);
482 virtual bool load_compressed(
char* src,
bool decompress);
500 virtual bool apply_preprocessor(
bool force_preprocessing=
false);
514 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0);
526 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
542 bool obtain_from_char(CStringFeatures<char>* sf, int32_t start,
543 int32_t p_order, int32_t gap,
bool rev);
557 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start,
558 int32_t p_order, int32_t gap,
bool rev);
569 bool have_same_length(int32_t len=-1);
576 void embed_features(int32_t p_order);
584 void compute_symbol_mask_table(int64_t max_val);
592 void unembed_word(ST word, uint8_t* seq, int32_t len);
599 ST embed_word(ST* seq, int32_t len);
605 void determine_maximum_string_length();
614 static ST* get_zero_terminated_string_copy(SGString<ST> str);
624 virtual void set_feature_vector(int32_t num, ST*
string, int32_t len);
630 virtual void get_histogram(
float64_t** hist, int32_t* rows, int32_t* cols,
631 bool normalize=
true);
637 virtual void create_random(
float64_t* hist, int32_t rows, int32_t cols,
648 virtual CFeatures* copy_subset(SGVector<index_t> indices);
/** @return the name of this class, "StringFeatures" */
651 virtual const char*
get_name()
const {
return "StringFeatures"; }
654 virtual void subset_changed_post();
667 virtual ST* compute_feature_vector(int32_t num, int32_t& len);
713 #endif // _CSTRINGFEATURES__H__
bool preprocess_on_get
preprocess on-the-fly?
RAWDNA - letters 0,1,2,3.
virtual const char * get_name() const
EAlphabet
Alphabet of charfeatures/observations.
SGString< ST > * features
The class Alphabet implements an alphabet and alphabet utility functions.
int32_t length_of_single_string
length of prior single string
floatmax_t num_symbols
number of used symbols
EFeatureClass
shogun feature class
int32_t symbol_mask_table_len
length of the symbol mask table
int32_t order
order used in higher order mapping
CCache< ST > * feature_cache
EFeatureType
shogun feature type
all classes and functions are contained in the shogun namespace
ST * symbol_mask_table
symbol mask table
int32_t max_string_length
floatmax_t original_num_symbols
original number of used symbols (before higher order mapping)