10 #include <sys/types.h> 38 num_symbols=alphabet->get_num_symbols();
39 original_num_symbols=num_symbols;
49 num_symbols=alphabet->get_num_symbols();
50 original_num_symbols=num_symbols;
61 num_symbols=alphabet->get_num_symbols();
62 original_num_symbols=num_symbols;
75 original_num_symbols=num_symbols;
79 :
CFeatures(orig), num_vectors(orig.num_vectors),
80 single_string(orig.single_string),
81 length_of_single_string(orig.length_of_single_string),
82 max_string_length(orig.max_string_length),
83 num_symbols(orig.num_symbols),
84 original_num_symbols(orig.original_num_symbols),
85 order(orig.order), preprocess_on_get(false),
99 for (int32_t i=0; i<num_vectors; i++)
101 features[i].string=SG_MALLOC(ST, orig.
features[i].
slen);
109 symbol_mask_table=SG_MALLOC(ST, 256);
110 symbol_mask_table_len=256;
112 for (int32_t i=0; i<256; i++)
122 features(NULL), single_string(NULL), length_of_single_string(0),
123 max_string_length(0), order(0),
124 preprocess_on_get(false), feature_cache(NULL)
130 num_symbols=alphabet->get_num_symbols();
131 original_num_symbols=num_symbols;
144 remove_all_subsets();
148 SG_FREE(single_string);
152 cleanup_feature_vectors(0, num_vectors-1);
166 SG_FREE(symbol_mask_table);
168 symbol_mask_table=NULL;
182 ASSERT(num<get_num_vectors())
186 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
187 SG_FREE(features[real_num].
string);
188 features[real_num].string=NULL;
189 features[real_num].slen=0;
191 determine_maximum_string_length();
197 if (features && get_num_vectors())
199 ASSERT(start<get_num_vectors())
200 ASSERT(stop<get_num_vectors())
202 for (int32_t i=start; i<=stop; i++)
204 int32_t real_num=m_subset_stack->subset_idx_conversion(i);
205 SG_FREE(features[real_num].
string);
206 features[real_num].string=NULL;
207 features[real_num].slen=0;
209 determine_maximum_string_length();
231 if (num>=get_num_vectors())
233 SG_ERROR(
"Index out of bounds (number of strings %d, you " 234 "requested %d)\n", get_num_vectors(), num);
239 ST* vec=get_feature_vector(num, l, free_vec);
240 ST* dst=SG_MALLOC(ST, l);
241 sg_memcpy(dst, vec, l*
sizeof(ST));
242 free_feature_vector(vec, num, free_vec);
250 if (m_subset_stack->has_subsets())
251 SG_ERROR(
"A subset is set, cannot set feature vector\n")
253 if (num>=num_vectors)
255 SG_ERROR(
"Index out of bounds (number of strings %d, you " 256 "requested %d)\n", num_vectors, num);
260 SG_ERROR(
"String has zero or negative length\n")
262 cleanup_feature_vector(num);
263 features[num].slen=vector.
vlen;
264 features[num].string=SG_MALLOC(ST, vector.
vlen);
265 sg_memcpy(features[num].
string, vector.
vector, vector.
vlen*
sizeof(ST));
267 determine_maximum_string_length();
272 preprocess_on_get=
true;
277 preprocess_on_get=
false;
283 if (num>=get_num_vectors())
284 SG_ERROR(
"Requested feature vector with index %d while total num is", num, get_num_vectors())
286 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
288 if (!preprocess_on_get)
291 len=features[real_num].slen;
292 return features[real_num].string;
296 SG_DEBUG(
"computing feature vector!\n")
297 ST* feat=compute_feature_vector(num, len);
300 if (get_num_preprocessors())
302 ST* tmp_feat_before=feat;
304 for (int32_t i=0; i<get_num_preprocessors(); i++)
309 SG_FREE(tmp_feat_before);
310 tmp_feat_before=feat;
333 num_feat=get_num_vectors();
334 num_vec=get_max_vector_length();
335 ASSERT(have_same_length())
337 SG_DEBUG(
"Allocating memory for transposed string features of size %ld\n",
338 int64_t(num_feat)*num_vec);
342 for (int32_t i=0; i<num_vec; i++)
344 sf[i].
string=SG_MALLOC(ST, num_feat);
348 for (int32_t i=0; i<num_feat; i++)
352 ST* vec=get_feature_vector(i, len, free_vec);
354 for (int32_t j=0; j<num_vec; j++)
355 sf[j].
string[i]=vec[j];
357 free_feature_vector(vec, i, free_vec);
364 if (num>=get_num_vectors())
367 "Trying to access string[%d] but num_str=%d\n", num,
371 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
374 feature_cache->unlock_entry(real_num);
382 if (num>=get_num_vectors())
385 "Trying to access string[%d] but num_str=%d\n", num,
389 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
392 feature_cache->unlock_entry(real_num);
397 ASSERT(vec_num<get_num_vectors())
401 ST* vec=get_feature_vector(vec_num, len, free_vec);
403 ST result=vec[feat_num];
404 free_feature_vector(vec, vec_num, free_vec);
411 ASSERT(vec_num<get_num_vectors())
415 ST* vec=get_feature_vector(vec_num, len, free_vec);
416 free_feature_vector(vec, vec_num, free_vec);
422 return max_string_length;
427 return m_subset_stack->has_subsets() ? m_subset_stack->get_size() : num_vectors;
441 return symbol_mask_table[mask] & symbol;
447 return (offset << (amount*alphabet->get_num_bits()));
453 return (symbol >> (amount*alphabet->get_num_bits()));
459 remove_all_subsets();
461 size_t blocksize=1024*1024;
462 size_t required_blocksize=0;
463 uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
464 uint8_t* overflow=NULL;
465 int32_t overflow_len=0;
472 FILE* f=fopen(fname,
"ro");
479 SG_INFO(
"counting line numbers in file %s\n", fname)
481 size_t old_block_offs=0;
482 fseek(f, 0, SEEK_END);
483 size_t fsize=ftell(f);
489 SG_DEBUG(
"block_size=%ld file_size=%ld\n", blocksize, fsize)
492 while (sz == blocksize)
494 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
495 for (
size_t i=0; i<sz; i++)
498 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
501 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs);
502 old_block_offs=block_offs;
505 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t")
508 SG_INFO(
"found %d strings\n", num_vectors)
510 blocksize=required_blocksize;
511 dummy=SG_MALLOC(uint8_t, blocksize);
512 overflow=SG_MALLOC(uint8_t, blocksize);
518 while (sz == blocksize)
520 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
523 for (
size_t i=0; i<sz; i++)
525 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
527 int32_t len=i-old_sz;
529 max_string_length=
CMath::max(max_string_length, len+overflow_len);
531 features[lines].slen=len;
532 features[lines].string=SG_MALLOC(ST, len);
536 for (int32_t j=0; j<overflow_len; j++)
537 features[lines].
string[j]=alpha->
remap_to_bin(overflow[j]);
538 for (int32_t j=0; j<len; j++)
539 features[lines].
string[j+overflow_len]=alpha->
remap_to_bin(dummy[old_sz+j]);
545 for (int32_t j=0; j<overflow_len; j++)
546 features[lines].
string[j]=overflow[j];
547 for (int32_t j=0; j<len; j++)
548 features[lines].
string[j+overflow_len]=dummy[old_sz+j];
559 SG_PROGRESS(lines, 0, num_vectors, 1,
"LOADING:\t")
562 for (
size_t i=old_sz; i<sz; i++)
563 overflow[i-old_sz]=dummy[i];
565 overflow_len=sz-old_sz;
570 SG_INFO(
"file successfully read\n")
571 SG_INFO(
"max_string_length=%d\n", max_string_length)
572 SG_INFO(
"num_strings=%d\n", num_vectors)
593 num_symbols=alphabet->get_num_symbols();
598 remove_all_subsets();
614 if (len>0 && s[0]==
'>')
619 SG_ERROR(
"No fasta hunks (lines starting with '>') found\n")
624 num_symbols=alphabet->get_num_symbols();
637 int32_t spanned_lines=0;
642 SG_ERROR(
"Error reading fasta entry in line %d len=%ld", 4*i+1, len)
644 if (s[0]==
'>' || offs==f.
get_size())
653 len=fasta_len-spanned_lines;
654 strings[i].
string=SG_MALLOC(ST, len);
657 ST* str=strings[i].
string;
659 SG_DEBUG(
"'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len,
id, (int32_t) len, (int32_t) spanned_lines)
661 for (int32_t j=0; j<fasta_len; j++)
668 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
671 if (uint64_t(idx)>=len)
672 SG_ERROR(
"idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str)
686 return set_features(strings, num, max_len);
690 bool ignore_invalid,
bool bitremap_in_single_string)
692 remove_all_subsets();
704 SG_ERROR(
"Number of lines must be divisible by 4 in fastq files\n")
714 if (bitremap_in_single_string)
717 strings[0].
string=SG_MALLOC(ST, num);
724 original_num_symbols=alphabet->get_num_symbols();
725 str=SG_MALLOC(ST, len);
733 SG_ERROR(
"Error reading 'read' identifier in line %d", 4*i)
737 SG_ERROR(
"Error reading 'read' in line %d len=%ld", 4*i+1, len)
739 if (bitremap_in_single_string)
741 if (len!=(uint64_t) order)
742 SG_ERROR(
"read in line %d not of length %d (is %d)\n", 4*i+1, order, len)
743 for (int32_t j=0; j<order; j++)
744 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
746 strings[0].
string[i]=embed_word(str, order);
750 strings[i].
string=SG_MALLOC(ST, len);
756 for (uint64_t j=0; j<len; j++)
758 if (alphabet->is_valid((uint8_t) s[j]))
766 for (uint64_t j=0; j<len; j++)
774 SG_ERROR(
"Error reading 'read' quality identifier in line %d", 4*i+2)
777 SG_ERROR(
"Error reading 'read' quality in line %d", 4*i+3)
780 if (bitremap_in_single_string)
784 max_string_length=max_len;
792 remove_all_subsets();
794 struct dirent **namelist;
797 SGIO::set_dirname(dirname);
802 TCHAR search_dir[MAX_PATH];
804 LARGE_INTEGER filesize;
805 HANDLE h_find = INVALID_HANDLE_VALUE;
807 StringCchCopy(search_dir, MAX_PATH, dirname);
808 StringCchCat(search_dir, MAX_PATH, TEXT(
"\\*"));
810 h_find = FindFirstFile(search_dir, &ffd);
811 if (INVALID_HANDLE_VALUE == h_find)
813 SG_ERROR(
"Error finding finds in %s\n", dirname)
817 std::vector<struct dirent*> files;
820 if (ffd.dwFileAttributes & FILE_ATTRIBUTE_NORMAL)
822 struct dirent* d = SG_MALLOC(
struct dirent, 1);
823 StringCchCopy(d->d_name, MAX_PATH, ffd.cFileName);
828 while (FindNextFile(h_find, &ffd) != 0);
829 namelist = &files[0];
832 n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
836 SG_ERROR(
"error calling scandir - no files found\n")
850 for (int32_t i=0; i<n; i++)
852 char* fname=SGIO::concat_filename(namelist[i]->d_name);
857 if (!stat(fname, &s) && s.st_size>0)
859 filesize=s.st_size/
sizeof(ST);
861 FILE* f=fopen(fname,
"ro");
864 ST* str=SG_MALLOC(ST, filesize);
865 SG_DEBUG(
"%s:%ld\n", fname, (int64_t) filesize)
866 if (fread(str,
sizeof(ST), filesize, f)!=(
size_t) filesize)
869 strings[num].
slen=filesize;
870 max_len=
CMath::max(max_len, strings[num].slen);
877 SG_ERROR(
"empty or non readable file \'%s\'\n", fname)
879 SG_FREE(namelist[i]);
883 if (num>0 && strings)
885 set_features(strings, num, max_len);
900 if (m_subset_stack->has_subsets())
901 SG_ERROR(
"Cannot call set_features() with subset.\n")
908 for (int32_t i=0; i<p_num_vectors; i++)
924 sg_memcpy(features,p_features,
sizeof(
SGString<ST>)*p_num_vectors);
925 num_vectors = p_num_vectors;
926 max_string_length = p_max_string_length;
941 if (m_subset_stack->has_subsets())
942 SG_ERROR(
"Cannot call set_features() with subset.\n")
947 for (int32_t i=0; i<sf_num_str; i++)
951 new_features[i].
string=SG_MALLOC(ST, length);
952 sg_memcpy(new_features[i].
string, sf->
features[real_i].
string, length);
953 new_features[i].
slen=length;
955 return append_features(new_features, sf_num_str,
961 if (m_subset_stack->has_subsets())
962 SG_ERROR(
"Cannot call set_features() with subset.\n")
965 return set_features(p_features, p_num_vectors, p_max_string_length);
970 for (int32_t i=0; i<p_num_vectors; i++)
979 for (int32_t i=0; i<p_num_vectors; i++)
980 alphabet->add_string_to_histogram( p_features[i].
string, p_features[i].
slen);
982 int32_t old_num_vectors=num_vectors;
983 num_vectors=old_num_vectors+p_num_vectors;
986 for (int32_t i=0; i<num_vectors; i++)
988 if (i<old_num_vectors)
990 new_features[i].
string=features[i].string;
991 new_features[i].
slen=features[i].slen;
995 new_features[i].
string=p_features[i-old_num_vectors].
string;
996 new_features[i].
slen=p_features[i-old_num_vectors].
slen;
1000 SG_FREE(p_features);
1002 this->features=new_features;
1003 max_string_length=
CMath::max(max_string_length, p_max_string_length);
1022 if (m_subset_stack->has_subsets())
1023 SG_ERROR(
"get features() is not possible on subset")
1025 num_str=num_vectors;
1026 max_str_len=max_string_length;
1034 num_str=get_num_vectors();
1035 max_str_len=max_string_length;
1038 for (int32_t i=0; i<num_str; i++)
1042 ST* vec=get_feature_vector(i, len, free_vec);
1043 new_feat[i].
string=SG_MALLOC(ST, len);
1044 new_feat[i].
slen=len;
1045 sg_memcpy(new_feat[i].
string, vec, ((
size_t) len) *
sizeof(ST));
1046 free_feature_vector(vec, i, free_vec);
1055 int32_t max_str_len;
1056 *dst=copy_features(num_vec, max_str_len);
1062 remove_all_subsets();
1066 if (!(file=fopen(src,
"r")))
1072 if (fread(&
id[0],
sizeof(
char), 1, file)!=1)
1075 if (fread(&
id[1],
sizeof(
char), 1, file)!=1)
1078 if (fread(&
id[2],
sizeof(
char), 1, file)!=1)
1081 if (fread(&
id[3],
sizeof(
char), 1, file)!=1)
1087 if (fread(&c,
sizeof(uint8_t), 1, file)!=1)
1088 SG_ERROR(
"failed to read compression type")
1093 if (fread(&a,
sizeof(uint8_t), 1, file)!=1)
1094 SG_ERROR(
"failed to read compression alphabet")
1097 if (fread(&num_vectors,
sizeof(int32_t), 1, file)!=1)
1098 SG_ERROR(
"failed to read compression number of vectors")
1101 if (fread(&max_string_length,
sizeof(int32_t), 1, file)!=1)
1102 SG_ERROR(
"failed to read maximum string length")
1103 ASSERT(max_string_length>0)
1108 for (int32_t i=0; i<num_vectors; i++)
1111 int32_t len_compressed;
1112 if (fread(&len_compressed,
sizeof(int32_t), 1, file)!=1)
1113 SG_ERROR(
"failed to read vector length compressed")
1115 int32_t len_uncompressed;
1116 if (fread(&len_uncompressed,
sizeof(int32_t), 1, file)!=1)
1117 SG_ERROR(
"failed to read vector length uncompressed")
1122 features[i].string=SG_MALLOC(ST, len_uncompressed);
1123 features[i].slen=len_uncompressed;
1124 uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1125 if (fread(compressed,
sizeof(uint8_t), len_compressed, file)!=(
size_t) len_compressed)
1126 SG_ERROR(
"failed to read compressed data (expected %d bytes)", len_compressed)
1127 uint64_t uncompressed_size=len_uncompressed;
1128 uncompressed_size*=
sizeof(ST);
1129 compressor->
decompress(compressed, len_compressed,
1130 (uint8_t*) features[i].
string, uncompressed_size);
1131 SG_FREE(compressed);
1132 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*
sizeof(ST))
1136 int32_t offs=CMath::ceil(2.0*
sizeof(int32_t)/
sizeof(ST));
1137 features[i].string=SG_MALLOC(ST, len_compressed+offs);
1138 features[i].slen=len_compressed+offs;
1139 int32_t* feat32ptr=((int32_t*) (features[i].
string));
1140 memset(features[i].
string, 0, offs*
sizeof(ST));
1141 feat32ptr[0]=(int32_t) len_compressed;
1142 feat32ptr[1]=(int32_t) len_uncompressed;
1143 uint8_t* compressed=(uint8_t*) (&features[i].
string[offs]);
1144 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1145 SG_ERROR(
"failed to read uncompressed data")
1157 if (m_subset_stack->has_subsets())
1158 SG_ERROR(
"save_compressed() is not possible on subset")
1162 if (!(file=fopen(dest,
"wb")))
1168 const char*
id=
"SGV0";
1169 fwrite(&
id[0],
sizeof(
char), 1, file);
1170 fwrite(&
id[1],
sizeof(
char), 1, file);
1171 fwrite(&
id[2],
sizeof(
char), 1, file);
1172 fwrite(&
id[3],
sizeof(
char), 1, file);
1175 uint8_t c=(uint8_t) compression;
1176 fwrite(&c,
sizeof(uint8_t), 1, file);
1178 uint8_t a=(uint8_t) alphabet->get_alphabet();
1179 fwrite(&a,
sizeof(uint8_t), 1, file);
1181 fwrite(&num_vectors,
sizeof(int32_t), 1, file);
1183 fwrite(&max_string_length,
sizeof(int32_t), 1, file);
1186 for (int32_t i=0; i<num_vectors; i++)
1190 ST* vec=get_feature_vector(i, len, vfree);
1192 uint8_t* compressed=NULL;
1193 uint64_t compressed_size=0;
1195 compressor->
compress((uint8_t*) vec, ((uint64_t) len)*
sizeof(ST),
1196 compressed, compressed_size, level);
1198 int32_t len_compressed=(int32_t) compressed_size;
1200 fwrite(&len_compressed,
sizeof(int32_t), 1, file);
1202 fwrite(&len,
sizeof(int32_t), 1, file);
1204 fwrite(compressed, compressed_size, 1, file);
1205 SG_FREE(compressed);
1207 free_feature_vector(vec, i, vfree);
1217 SG_DEBUG(
"force: %d\n", force_preprocessing)
1219 for (int32_t i=0; i<get_num_preprocessors(); i++)
1221 if ( (!is_preprocessed(i) || force_preprocessing) )
1223 set_preprocessed(i);
1241 if (m_subset_stack->has_subsets())
1246 ASSERT(num_vectors==1 || single_string)
1247 ASSERT(max_string_length>=window_size ||
1248 (single_string && length_of_single_string>=window_size));
1253 num_vectors= (length_of_single_string-window_size)/step_size + 1;
1254 else if (num_vectors==1)
1256 num_vectors= (max_string_length-window_size)/step_size + 1;
1257 length_of_single_string=max_string_length;
1262 for (int32_t i=0; i<num_vectors; i++)
1264 f[i].
string=&features[0].string[offs+skip];
1265 f[i].
slen=window_size-skip;
1268 single_string=features[0].string;
1271 max_string_length=window_size-skip;
1279 if (m_subset_stack->has_subsets())
1284 ASSERT(num_vectors==1 || single_string)
1285 ASSERT(max_string_length>=window_size ||
1286 (single_string && length_of_single_string>=window_size));
1296 len=length_of_single_string;
1299 single_string=features[0].string;
1300 len=max_string_length;
1301 length_of_single_string=max_string_length;
1305 for (int32_t i=0; i<num_vectors; i++)
1309 if (p>=0 && p<=len-window_size)
1311 f[i].
string=&features[0].string[p+skip];
1312 f[i].
slen=window_size-skip;
1317 max_string_length=len;
1318 features[0].slen=len;
1321 SG_ERROR(
"window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1322 window_size, i, p, len);
1329 max_string_length=window_size-skip;
1336 return obtain_from_char_features(sf, start, p_order, gap, rev);
1343 if (len!=max_string_length)
1346 len=max_string_length;
1348 index_t num_str=get_num_vectors();
1349 for (int32_t i=0; i<num_str; i++)
1351 if (get_vector_length(i)!=len)
1360 if (m_subset_stack->has_subsets())
1363 ASSERT(alphabet->get_num_symbols_in_histogram() > 0)
1366 original_num_symbols=alphabet->get_num_symbols();
1367 int32_t max_val=alphabet->get_num_bits();
1372 num_symbols=original_num_symbols;
1374 SG_INFO(
"max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
1377 SG_WARNING(
"symbols did not fit into datatype \"%c\" (%d)\n", (
char) max_val, (
int) max_val)
1380 for (int32_t i=0; i<p_order*max_val; i++)
1381 mask= (mask<<1) | ((ST) 1);
1383 for (int32_t i=0; i<num_vectors; i++)
1385 int32_t len=features[i].slen;
1388 SG_ERROR(
"Sequence must be longer than order (%d vs. %d)\n", len, p_order)
1390 ST* str=features[i].string;
1393 for (int32_t j=0; j<p_order; j++)
1394 str[j]=(ST) alphabet->remap_to_bin(str[j]);
1395 str[0]=embed_word(&str[0], p_order);
1399 for (int32_t j=p_order; j<len; j++)
1401 str[j]=(ST) alphabet->remap_to_bin(str[j]);
1402 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1406 features[i].slen=len-p_order+1;
1409 compute_symbol_mask_table(max_val);
1414 if (m_subset_stack->has_subsets())
1417 SG_FREE(symbol_mask_table);
1418 symbol_mask_table=SG_MALLOC(ST, 256);
1419 symbol_mask_table_len=256;
1422 for (int32_t i=0; i< (int64_t) max_val; i++)
1425 for (int32_t i=0; i<256; i++)
1427 uint8_t bits=(uint8_t) i;
1428 symbol_mask_table[i]=0;
1430 for (int32_t j=0; j<8; j++)
1433 symbol_mask_table[i]|=mask<<(max_val*j);
1442 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1445 for (uint32_t i=0; i<nbits; i++)
1446 mask=(mask<<1) | (ST) 1;
1448 for (int32_t i=0; i<len; i++)
1451 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1459 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1460 for (int32_t i=0; i<len; i++)
1471 max_string_length=0;
1472 index_t num_str=get_num_vectors();
1474 for (int32_t i=0; i<num_str; i++)
1476 max_string_length=
CMath::max(max_string_length,
1477 features[m_subset_stack->subset_idx_conversion(i)].slen);
1484 ST* s=SG_MALLOC(ST, l+1);
1485 sg_memcpy(s, str.
string,
sizeof(ST)*l);
1493 ASSERT(num<get_num_vectors())
1495 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1498 features[real_num].slen=len ;
1499 features[real_num].string=string ;
1501 max_string_length=
CMath::max(len, max_string_length);
1506 int32_t nsym=get_num_symbols();
1507 int32_t slen=get_max_vector_length();
1508 int64_t sz=int64_t(nsym)*slen*
sizeof(
float64_t);
1513 memset(h_normalizer, 0, slen*
sizeof(
float64_t));
1514 int32_t num_str=get_num_vectors();
1515 for (int32_t i=0; i<num_str; i++)
1519 ST* vec=get_feature_vector(i, len, free_vec);
1520 for (int32_t j=0; j<len; j++)
1522 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1525 free_feature_vector(vec, i, free_vec);
1530 for (int32_t i=0; i<slen; i++)
1532 for (int32_t j=0; j<nsym; j++)
1534 if (h_normalizer && h_normalizer[i])
1535 h[int64_t(i)*nsym+j]/=h_normalizer[i];
1539 SG_FREE(h_normalizer);
1548 ASSERT(rows == get_num_symbols())
1553 for (int32_t i=0; i<num_vec; i++)
1555 sf[i].
string=SG_MALLOC(ST, cols);
1560 for (int32_t j=0; j<cols; j++)
1565 for (c=0; c<rows-1; c++)
1567 if (randoms[j]<=lik)
1569 lik+=hist[int64_t(j)*rows+c+1];
1571 sf[i].
string[j]=alphabet->remap_to_char(c);
1575 set_features(sf, num_vec, cols);
1658 index_t real_idx=m_subset_stack->subset_idx_conversion(indices.
vector[i]);
1664 current_string.
slen*
sizeof(ST));
1665 list_copy.
strings[i]=string_copy;
1675 result->
order=order;
1686 determine_maximum_string_length();
1691 ASSERT(features && num<get_num_vectors())
1693 int32_t real_num=m_subset_stack->subset_idx_conversion(num);
1695 len=features[real_num].slen;
1699 ST* target=SG_MALLOC(ST, len);
1700 sg_memcpy(target, features[real_num].
string, len*
sizeof(ST));
1712 length_of_single_string=0;
1713 max_string_length=0;
1715 preprocess_on_get=
false;
1717 symbol_mask_table=NULL;
1718 symbol_mask_table_len=0;
1720 original_num_symbols=0;
1722 m_parameters->add((
CSGObject**) &alphabet,
"alphabet");
1723 m_parameters->add_vector(&features, &num_vectors,
"features",
1724 "This contains the array of features.");
1725 m_parameters->add_vector(&single_string,
1726 &length_of_single_string,
1728 "Created by sliding window.");
1729 m_parameters->add(&max_string_length,
"max_string_length",
1730 "Length of longest string.");
1731 m_parameters->add(&num_symbols,
"num_symbols",
1732 "Number of used symbols.");
1733 m_parameters->add(&original_num_symbols,
"original_num_symbols",
1734 "Original number of used symbols.");
1735 m_parameters->add(&order,
"order",
1736 "Order used in higher order mapping.");
1737 m_parameters->add(&preprocess_on_get,
"preprocess_on_get",
1738 "Preprocess on-the-fly?");
1740 m_parameters->add_vector(&symbol_mask_table, &symbol_mask_table_len,
"mask_table",
"Symbol mask table - using in higher order mapping");
1959 #define LOAD(f_load, sg_type) \ 1960 template<> void CStringFeatures<sg_type>::load(CFile* loader) \ 1962 SG_INFO("loading...\n") \ 1965 SGString<sg_type>* strs; \ 1968 loader->f_load(strs, num_str, max_len); \ 1969 set_features(strs, num_str, max_len); \ 1973 LOAD(get_string_list,
bool)
1974 LOAD(get_string_list,
char)
1975 LOAD(get_string_list, int8_t)
1976 LOAD(get_string_list, uint8_t)
1977 LOAD(get_string_list, int16_t)
1978 LOAD(get_string_list, uint16_t)
1979 LOAD(get_string_list, int32_t)
1980 LOAD(get_string_list, uint32_t)
1981 LOAD(get_string_list, int64_t)
1982 LOAD(get_string_list, uint64_t)
1988 #define SAVE(f_write, sg_type) \ 1989 template<> void CStringFeatures<sg_type>::save(CFile* writer) \ 1991 if (m_subset_stack->has_subsets()) \ 1992 SG_ERROR("save() is not possible on subset") \ 1995 writer->f_write(features, num_vectors); \ 1999 SAVE(set_string_list,
bool)
2000 SAVE(set_string_list,
char)
2001 SAVE(set_string_list, int8_t)
2002 SAVE(set_string_list, uint8_t)
2003 SAVE(set_string_list, int16_t)
2004 SAVE(set_string_list, uint16_t)
2005 SAVE(set_string_list, int32_t)
2006 SAVE(set_string_list, uint32_t)
2007 SAVE(set_string_list, int64_t)
2008 SAVE(set_string_list, uint64_t)
2014 template <
class ST>
template <
class CT>
2016 int32_t p_order, int32_t gap,
bool rev)
2018 remove_all_subsets();
2024 this->order=p_order;
2035 for (int32_t i=0; i<num_vectors; i++)
2042 features[i].string=SG_MALLOC(ST, len);
2043 features[i].slen=len;
2045 ST* str=features[i].string;
2046 for (int32_t j=0; j<len; j++)
2058 num_symbols=original_num_symbols;
2059 SG_INFO(
"max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols)
2063 SG_ERROR(
"symbol does not fit into datatype \"%c\" (%d)\n", (
char) max_val, (
int) max_val)
2067 SG_DEBUG(
"translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap,
sizeof(ST))
2068 for (int32_t line=0; line<num_vectors; line++)
2072 ST* fv=get_feature_vector(line, len, vfree);
2076 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2078 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2081 features[line].slen-=start+gap ;
2082 if (features[line].slen<0)
2083 features[line].slen=0 ;
2086 compute_symbol_mask_table(max_val);
CSubsetStack * m_subset_stack
int32_t get_num_symbols_in_histogram()
void determine_maximum_string_length()
int32_t get_num_symbols() const
EAlphabet
Alphabet of charfeatures/observations.
int32_t get_max_value_in_histogram()
bool check_alphabet_size(bool print_error=true)
SGString< ST > * features
#define SAVE(f_write, sg_type)
char * get_line(uint64_t &len, uint64_t &offs)
int32_t get_num_bits() const
#define SG_NOTIMPLEMENTED
virtual int32_t get_num_vectors() const
The class Alphabet implements an alphabet and alphabet utility functions.
Compression library for compressing and decompressing buffers using one of the standard compression a...
void compute_symbol_mask_table(int64_t max_val)
uint8_t remap_to_bin(uint8_t c)
EFeatureClass
shogun feature class
virtual const char * get_name() const
return the name of the preprocessor
int32_t order
order used in higher order mapping
void add_string_to_histogram(T *p, int64_t len)
Template class StringFeatures implements a list of strings.
Class SGObject is the base class of all shogun objects.
#define LOAD(f_load, sg_type)
virtual ST * apply_to_string(ST *f, int32_t &len)=0
apply preproc on single feature vector
A File access base class.
index_t max_string_length
SGVector< ST > get_feature_vector(int32_t num)
floatmax_t get_num_symbols()
Template class StringPreprocessor, base class for preprocessors (cf. CPreprocessor) that apply to CSt...
bool check_alphabet(bool print_error=true)
int32_t get_num_elements() const
EFeatureType
shogun feature type
void compress(uint8_t *uncompressed, uint64_t uncompressed_size, uint8_t *&compressed, uint64_t &compressed_size, int32_t level=1)
all of classes and functions are contained in the shogun namespace
const T & get_element(int32_t idx1, int32_t idx2=0, int32_t idx3=0) const
ST * symbol_mask_table
order used in higher order mapping
The class Features is the base class of all feature objects.
void decompress(uint8_t *compressed, uint64_t compressed_size, uint8_t *uncompressed, uint64_t &uncompressed_size)
int32_t max_string_length
virtual int32_t get_max_vector_length()
CAlphabet * get_alphabet()
T max(const Container< T > &a)
virtual bool apply_to_string_features(CFeatures *f)=0
template class SGStringList