SHOGUN  3.2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
Alphabet.h
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006-2009 Soeren Sonnenburg
8  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
9  */
10 
11 #ifndef _CALPHABET__H__
12 #define _CALPHABET__H__
13 
14 #include <shogun/base/SGObject.h>
15 #include <shogun/lib/common.h>
16 
17 namespace shogun
18 {
19 /// Alphabet of charfeatures/observations
20 enum EAlphabet
21 {
22  /// DNA - letters A,C,G,T
23  DNA=0,
24 
25  /// RAWDNA - letters 0,1,2,3
26  RAWDNA=1,
27 
28  /// RNA - letters A,C,G,U
29  RNA=2,
30 
31  /// PROTEIN - letters A-Z
32  PROTEIN=3,
33 
34  // BINARY just 0 and 1
35  BINARY=4,
36 
37  /// ALPHANUM - [0-9A-Z]
38  ALPHANUM=5,
39 
40  /// CUBE - [1-6]
41  CUBE=6,
42 
43  /// RAW BYTE - [0-255]
44  RAWBYTE=7,
45 
46  /// IUPAC_NUCLEIC_ACID
47  IUPAC_NUCLEIC_ACID=8,
48 
49  /// IUPAC_AMINO_ACID
50  IUPAC_AMINO_ACID=9,
51 
52  /// NONE - type has no alphabet
53  NONE=10,
54 
55  /// DIGIT - letters 0-9
56  DIGIT=11,
57 
58  /// DIGIT2 - letters 0-2
59  DIGIT2=12,
60 
61  /// RAWDIGIT - 0-9
62  RAWDIGIT=13,
63 
64  /// RAWDIGIT2 - 0-2
65  RAWDIGIT2=14,
66 
67  /// unknown alphabet
68  UNKNOWN=15,
69 
70  /// SNP - letters A,C,G,T,0
71  SNP=16,
72 
73  /// RAWSNP - letters 0,1,2,3,4
74  RAWSNP=17
75 };
76 
77 
88 class CAlphabet : public CSGObject
89 {
90  public:
91 
95  CAlphabet();
96 
102  CAlphabet(char* alpha, int32_t len);
103 
108  CAlphabet(EAlphabet alpha);
109 
114  CAlphabet(CAlphabet* alpha);
115  virtual ~CAlphabet();
116 
121  bool set_alphabet(EAlphabet alpha);
122 
127  inline EAlphabet get_alphabet() const
128  {
129  return alphabet;
130  }
131 
136  inline int32_t get_num_symbols() const
137  {
138  return num_symbols;
139  }
140 
146  inline int32_t get_num_bits() const
147  {
148  return num_bits;
149  }
150 
156  inline uint8_t remap_to_bin(uint8_t c)
157  {
158  return maptable_to_bin[c];
159  }
160 
166  inline uint8_t remap_to_char(uint8_t c)
167  {
168  return maptable_to_char[c];
169  }
170 
172  void clear_histogram();
173 
179  template <class T>
180  void add_string_to_histogram(T* p, int64_t len)
181  {
182  for (int64_t i=0; i<len; i++)
183  add_byte_to_histogram((uint8_t) (p[i]));
184  }
185 
190  inline void add_byte_to_histogram(uint8_t p)
191  {
192  histogram[p]++;
193  }
194 
196  void print_histogram();
197 
203 
210  bool check_alphabet(bool print_error=true);
211 
218  inline bool is_valid(uint8_t c)
219  {
220  return valid_chars[c];
221  }
222 
228  bool check_alphabet_size(bool print_error=true);
229 
235 
240  int32_t get_max_value_in_histogram();
241 
248  int32_t get_num_bits_in_histogram();
249 
254  static const char* get_alphabet_name(EAlphabet alphabet);
255 
256 
258  virtual const char* get_name() const { return "Alphabet"; }
259 
268  template <class ST>
269  static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
270 
279  template <class ST>
280  static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val);
281 
291  template <class ST>
292  static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
293 
303  template <class ST>
304  static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap);
305 
306  private:
309  void init();
310 
311  protected:
313  void init_map_table();
314 
319  void copy_histogram(CAlphabet* src);
320 
321  public:
323  static const uint8_t B_A;
325  static const uint8_t B_C;
327  static const uint8_t B_G;
329  static const uint8_t B_T;
331  static const uint8_t B_0;
333  static const uint8_t MAPTABLE_UNDEF;
335  static const char* alphabet_names[18];
336 
337  protected:
346  virtual void load_serializable_post() throw (ShogunException);
347 
348  protected:
352  int32_t num_symbols;
354  int32_t num_bits;
356  bool valid_chars[1 << (sizeof(uint8_t)*8)];
358  uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
360  uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
362  int64_t histogram[1 << (sizeof(uint8_t)*8)];
363 };
364 }
365 #endif
RNA - letters A,C,G,U.
Definition: Alphabet.h:29
bool valid_chars[1<< (sizeof(uint8_t)*8)]
Definition: Alphabet.h:356
RAWDIGIT - 0-9.
Definition: Alphabet.h:62
PROTEIN - letters A-Z.
Definition: Alphabet.h:32
static const uint8_t B_T
Definition: Alphabet.h:329
int32_t get_num_symbols_in_histogram()
Definition: Alphabet.cpp:565
DNA - letters A,C,G,T.
Definition: Alphabet.h:23
ALPHANUM - [0-9A-Z].
Definition: Alphabet.h:38
int32_t get_num_bits_in_histogram()
Definition: Alphabet.cpp:577
static const char * get_alphabet_name(EAlphabet alphabet)
Definition: Alphabet.cpp:669
SNP - letters A,C,G,T,0.
Definition: Alphabet.h:71
static const uint8_t B_G
Definition: Alphabet.h:327
RAWDNA - letters 0,1,2,3.
Definition: Alphabet.h:26
void copy_histogram(CAlphabet *src)
Definition: Alphabet.cpp:656
uint8_t maptable_to_bin[1<< (sizeof(uint8_t)*8)]
Definition: Alphabet.h:358
EAlphabet
Alphabet of charfeatures/observations.
Definition: Alphabet.h:20
int32_t get_max_value_in_histogram()
Definition: Alphabet.cpp:550
bool check_alphabet_size(bool print_error=true)
Definition: Alphabet.cpp:639
static const uint8_t B_0
Definition: Alphabet.h:331
Class ShogunException defines an exception which is thrown whenever an error inside of shogun occurs...
void print_histogram()
print histogram
Definition: Alphabet.cpp:587
EAlphabet get_alphabet() const
Definition: Alphabet.h:127
static const uint8_t B_C
Definition: Alphabet.h:325
static void translate_from_single_order(ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
Definition: Alphabet.cpp:760
The class Alphabet implements an alphabet and alphabet utility functions.
Definition: Alphabet.h:88
virtual const char * get_name() const
Definition: Alphabet.h:258
void add_byte_to_histogram(uint8_t p)
Definition: Alphabet.h:190
IUPAC_AMINO_ACID.
Definition: Alphabet.h:50
static const uint8_t MAPTABLE_UNDEF
Definition: Alphabet.h:333
int32_t num_symbols
Definition: Alphabet.h:352
static const char * alphabet_names[18]
Definition: Alphabet.h:335
virtual void load_serializable_post()
Definition: Alphabet.cpp:749
uint8_t remap_to_bin(uint8_t c)
Definition: Alphabet.h:156
bool is_valid(uint8_t c)
Definition: Alphabet.h:218
void init_map_table()
Definition: Alphabet.cpp:179
virtual ~CAlphabet()
Definition: Alphabet.cpp:104
void add_string_to_histogram(T *p, int64_t len)
Definition: Alphabet.h:180
static void translate_from_single_order_reversed(ST *obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
Definition: Alphabet.cpp:798
DIGIT2 - letters 0-2.
Definition: Alphabet.h:59
EAlphabet alphabet
Definition: Alphabet.h:350
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:102
RAWSNP - letters 0,1,2,3,4.
Definition: Alphabet.h:74
int32_t num_bits
Definition: Alphabet.h:354
int32_t get_num_symbols() const
Definition: Alphabet.h:136
uint8_t maptable_to_char[1<< (sizeof(uint8_t)*8)]
Definition: Alphabet.h:360
bool set_alphabet(EAlphabet alpha)
Definition: Alphabet.cpp:108
NONE - type has no alphabet.
Definition: Alphabet.h:53
IUPAC_NUCLEIC_ACID.
Definition: Alphabet.h:47
void clear_histogram()
clear histogram
Definition: Alphabet.cpp:544
bool check_alphabet(bool print_error=true)
Definition: Alphabet.cpp:617
SGVector< int64_t > get_histogram()
Definition: Alphabet.cpp:612
int32_t get_num_bits() const
Definition: Alphabet.h:146
unknown alphabet
Definition: Alphabet.h:68
RAWDIGIT2 - 0-2.
Definition: Alphabet.h:65
DIGIT - letters 0-9.
Definition: Alphabet.h:56
uint8_t remap_to_char(uint8_t c)
Definition: Alphabet.h:166
CUBE - [1-6].
Definition: Alphabet.h:41
int64_t histogram[1<< (sizeof(uint8_t)*8)]
Definition: Alphabet.h:362
RAW BYTE - [0-255].
Definition: Alphabet.h:44
static const uint8_t B_A
Definition: Alphabet.h:323

SHOGUN Machine Learning Toolbox - Documentation