89 int32_t aa_to_index[128];
90 aa_to_index[(uint8_t)
'A'] = 0;
91 aa_to_index[(uint8_t)
'R'] = 1;
92 aa_to_index[(uint8_t)
'N'] = 2;
93 aa_to_index[(uint8_t)
'D'] = 3;
94 aa_to_index[(uint8_t)
'C'] = 4;
95 aa_to_index[(uint8_t)
'Q'] = 5;
96 aa_to_index[(uint8_t)
'E'] = 6;
97 aa_to_index[(uint8_t)
'G'] = 7;
98 aa_to_index[(uint8_t)
'H'] = 8;
99 aa_to_index[(uint8_t)
'I'] = 9;
100 aa_to_index[(uint8_t)
'L'] = 10;
101 aa_to_index[(uint8_t)
'K'] = 11;
102 aa_to_index[(uint8_t)
'M'] = 12;
103 aa_to_index[(uint8_t)
'F'] = 13;
104 aa_to_index[(uint8_t)
'P'] = 14;
105 aa_to_index[(uint8_t)
'S'] = 15;
106 aa_to_index[(uint8_t)
'T'] = 16;
107 aa_to_index[(uint8_t)
'W'] = 17;
108 aa_to_index[(uint8_t)
'Y'] = 18;
109 aa_to_index[(uint8_t)
'V'] = 19;
110 SG_DEBUG(
"initializing background\n")
111 double background[20];
112 background[0]=0.0799912015849807;
113 background[1]=0.0484482507611578;
114 background[2]=0.044293531582512;
115 background[3]=0.0578891399707563;
116 background[4]=0.0171846021407367;
117 background[5]=0.0380578923048682;
118 background[6]=0.0638169929675978;
119 background[7]=0.0760659374742852;
120 background[8]=0.0223465499452473;
121 background[9]=0.0550905793661343;
122 background[10]=0.0866897071203864;
123 background[11]=0.060458245507428;
124 background[12]=0.0215379186368154;
125 background[13]=0.0396348024787477;
126 background[14]=0.0465746314476874;
127 background[15]=0.0630028230885602;
128 background[16]=0.0580394726014824;
129 background[17]=0.0144991866213453;
130 background[18]=0.03635438623143;
131 background[19]=0.0700241481678408;
134 std::vector<std::string> seqs;
138 const char *filename=
"/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles";
139 std::ifstream fin(filename);
141 SG_DEBUG(
"Reading profiles from %s\n", filename)
145 std::getline(fin, line);
149 int idx = line.find_first_of(
' ');
151 std::getline(fin, line);
152 std::string orig_sequence = line;
153 std::string sequence=
"";
155 int len_line = line.length();
159 std::getline(fin, line);
160 std::getline(fin, line);
161 std::getline(fin, line);
163 profiles.push_back(std::vector<double>());
165 std::vector<double>& curr_profile =
profiles.back();
166 for (
int i=0; i < len_line; ++i)
168 std::getline(fin, line);
169 int a = line.find_first_not_of(
' ');
170 int b = line.find_first_of(
' ', a);
171 a = line.find_first_not_of(
' ', b);
172 b = line.find_first_of(
' ', a);
173 std::string aa=line.substr(a,b-a);
176 int pos = seqs.size()+1;
177 SG_DEBUG(
"Skipping aa in sequence %d\n", pos)
184 a = line.find_first_not_of(
' ', b);
185 b = line.find_first_of(
' ', a);
187 for (
int j=0; j < 19; ++j)
189 a = line.find_first_not_of(
' ', b);
190 b = line.find_first_of(
' ', a);
195 for (
int j=0; j < 20; ++j)
197 a = line.find_first_not_of(
' ', b);
198 b = line.find_first_of(
' ', a);
199 double p = atof(line.substr(a, b-a).c_str());
204 double value = -1* std::log(C*(p/100)+(1-C)*background[j]);
205 curr_profile.push_back(value);
211 SG_DEBUG(
">>>>>>>>>>>>>>> all zeros")
212 if (aa !=
"B" && aa !=
"X" && aa !=
"Z")
215 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]];
216 double value = -1* std::log(C+(1-C)*background[aa_index]);
218 curr_profile[(i*20) + aa_index] = value;
219 SG_DEBUG(
">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value)
233 if (curr_profile.size() != 20 * sequence.length())
235 SG_ERROR(
"Something's wrong with the profile.\n")
239 seqs.push_back(sequence);
261 int len = seqs[i].length();
264 strcpy(
sequences[i].
string, seqs[i].c_str());
266 if (len > max_len) max_len = len;
284 int32_t lhs_changed=(
lhs!=l);
285 int32_t rhs_changed=(
rhs!=r);
289 SG_DEBUG(
"lhs_changed: %i\n", lhs_changed)
290 SG_DEBUG(
"rhs_changed: %i\n", rhs_changed)
320 if (c<65 || c>89 || c==
'B' || c==
'J' || c==
'O' || c==
'U' || c==
'X' || c==
'Z')
330 for (
int i=0; i<seq_degree; i++)
332 if (!
isaa(path[i])||!
isaa(joint_seq[index+i]))
337 diff -= 2*
AA_matrix.
matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ;
338 diff +=
AA_matrix.
matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ;
340 fprintf(stderr,
"nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ;
344 return exp( - diff/
width) ;
356 for (int32_t i=0; i<alen; i++)
358 for (int32_t j=0; j<blen; j++)
402 void CSpectrumRBFKernel::init()
virtual bool init(CFeatures *l, CFeatures *r)
The class Alphabet implements an alphabet and alphabet utility functions.
int32_t max_sequence_length
float64_t AA_helper(const char *path, const int degree, const char *joint_seq, unsigned int index)
CStringFeatures< char > * string_features
bool set_AA_matrix(float64_t *AA_matrix_)
void read_profiles_and_sequences()
Class SGObject is the base class of all shogun objects.
SGString< char > * sequences
std::vector< std::vector< float64_t > > profiles
index_t max_string_length
virtual void register_param()
virtual bool init_normalizer()
CFeatures * rhs
Feature vectors that occur on the right-hand side.
void add_vector(bool **param, index_t *length, const char *name, const char *description="")
All classes and functions are contained in the shogun namespace.
CFeatures * lhs
Feature vectors that occur on the left-hand side.
static int is_nan(double f)
checks whether a float is nan
The class Features is the base class of all feature objects.
virtual ~CSpectrumRBFKernel()
CAlphabet * get_alphabet()
float64_t compute(int32_t idx_a, int32_t idx_b)
Template class StringKernel is the base class of all string kernels.
EAlphabet get_alphabet() const
template class SGStringList
SGMatrix< float64_t > AA_matrix
std::vector< std::string > sequence_labels