Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages
SpectraMerger.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2015.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Chris Bielow, Andreas Bertsch $
33 // --------------------------------------------------------------------------
34 //
35 #ifndef OPENMS_FILTERING_TRANSFORMERS_SPECTRAMERGER_H
36 #define OPENMS_FILTERING_TRANSFORMERS_SPECTRAMERGER_H
37 
48 #include <vector>
49 
50 namespace OpenMS
51 {
52 
61  class OPENMS_DLLAPI SpectraMerger :
62  public DefaultParamHandler
63  {
64 
65 protected:
66 
67  /* Determine distance between two spectra
68 
69  Distance is determined as
70 
71  (d_rt/rt_max_ + d_mz/mz_max_) / 2
72 
73  */
75  public DefaultParamHandler
76  {
77 public:
79  DefaultParamHandler("SpectraDistance")
80  {
81  defaults_.setValue("rt_tolerance", 10.0, "Maximal RT distance (in [s]) for two spectra's precursors.");
82  defaults_.setValue("mz_tolerance", 1.0, "Maximal m/z distance (in Da) for two spectra's precursors.");
83  defaultsToParam_();
84  }
85 
87  {
88  rt_max_ = (double) param_.getValue("rt_tolerance");
89  mz_max_ = (double) param_.getValue("mz_tolerance");
90 
91  return;
92  }
93 
94  double getSimilarity(const double d_rt, const double d_mz) const
95  {
96  // 1 - distance
97  return 1 - ((d_rt / rt_max_ + d_mz / mz_max_) / 2);
98  }
99 
100  // measure of SIMILARITY (not distance, i.e. 1-distance)!!
101  double operator()(const BaseFeature & first, const BaseFeature & second) const
102  {
103  // get RT distance:
104  double d_rt = fabs(first.getRT() - second.getRT());
105  double d_mz = fabs(first.getMZ() - second.getMZ());
106 
107  if (d_rt > rt_max_ || d_mz > mz_max_) {return 0; }
108 
109  // calculate similarity (0-1):
110  double sim = getSimilarity(d_rt, d_mz);
111 
112  return sim;
113  }
114 
115 protected:
116  double rt_max_;
117  double mz_max_;
118 
119  }; // end of SpectraDistance
120 
121 public:
122 
125 
126  // @name Constructors and Destructors
127  // @{
129  SpectraMerger();
130 
132  SpectraMerger(const SpectraMerger & source);
133 
135  virtual ~SpectraMerger();
136  // @}
137 
138  // @name Operators
139  // @{
141  SpectraMerger & operator=(const SpectraMerger & source);
142  // @}
143 
144  // @name Merging functions
145  // @{
147  template <typename MapType>
149  {
150  IntList ms_levels = param_.getValue("block_method:ms_levels");
151  Int rt_block_size(param_.getValue("block_method:rt_block_size"));
152  double rt_max_length = (param_.getValue("block_method:rt_max_length"));
153 
154  if (rt_max_length == 0) // no rt restriction set?
155  {
156  rt_max_length = (std::numeric_limits<double>::max)(); // set max rt span to very large value
157  }
158 
159  for (IntList::iterator it_mslevel = ms_levels.begin(); it_mslevel < ms_levels.end(); ++it_mslevel)
160  {
161  MergeBlocks spectra_to_merge;
162  Size idx_block(0);
163  SignedSize block_size_count(rt_block_size + 1);
164  Size idx_spectrum(0);
165  for (typename MapType::const_iterator it1 = exp.begin(); it1 != exp.end(); ++it1)
166  {
167  if (Int(it1->getMSLevel()) == *it_mslevel)
168  {
169  // block full if it contains a maximum number of scans or if maximum rt length spanned
170  if (++block_size_count >= rt_block_size ||
171  exp[idx_spectrum].getRT() - exp[idx_block].getRT() > rt_max_length)
172  {
173  block_size_count = 0;
174  idx_block = idx_spectrum;
175  }
176  else
177  {
178  spectra_to_merge[idx_block].push_back(idx_spectrum);
179  }
180  }
181 
182  ++idx_spectrum;
183  }
184  // check if last block had sacrifice spectra
185  if (block_size_count == 0) //block just got initialized
186  {
187  spectra_to_merge[idx_block] = std::vector<Size>();
188  }
189 
190  // merge spectra, remove all old MS spectra and add new consensus spectra
191  mergeSpectra_(exp, spectra_to_merge, *it_mslevel);
192  }
193 
194  exp.sortSpectra();
195 
196  return;
197  }
198 
200  template <typename MapType>
202  {
203 
204  // convert spectra's precursors to clusterizable data
205  Size data_size;
206  std::vector<BinaryTreeNode> tree;
207  Map<Size, Size> index_mapping;
208  // local scope to save memory - we do not need the clustering stuff later
209  {
210  std::vector<BaseFeature> data;
211 
212  for (Size i = 0; i < exp.size(); ++i)
213  {
214  if (exp[i].getMSLevel() != 2) continue;
215 
216  // remember which index in distance data ==> experiment index
217  index_mapping[data.size()] = i;
218 
219  // make cluster element
220  BaseFeature bf;
221  bf.setRT(exp[i].getRT());
222  std::vector<Precursor> pcs = exp[i].getPrecursors();
223  if (pcs.empty()) throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, String("Scan #") + String(i) + " does not contain any precursor information! Unable to cluster!");
224  if (pcs.size() > 1) LOG_WARN << "More than one precursor found. Using first one!" << std::endl;
225  bf.setMZ(pcs[0].getMZ());
226  data.push_back(bf);
227  }
228  data_size = data.size();
229 
230  SpectraDistance_ llc;
231  llc.setParameters(param_.copy("precursor_method:", true));
232  SingleLinkage sl;
233  DistanceMatrix<float> dist; // will be filled
235 
236  //ch.setThreshold(0.99);
237  // clustering ; threshold is implicitly at 1.0, i.e. distances of 1.0 (== similarity 0) will not be clustered
238  ch.cluster<BaseFeature, SpectraDistance_>(data, llc, sl, tree, dist);
239  }
240 
241  // extract the clusters
242  ClusterAnalyzer ca;
243  std::vector<std::vector<Size> > clusters;
244  // count number of real tree nodes (not the -1 ones):
245  Size node_count = 0;
246  for (Size ii = 0; ii < tree.size(); ++ii)
247  {
248  if (tree[ii].distance >= 1) tree[ii].distance = -1; // manually set to disconnect, as SingleLinkage does not support it
249  if (tree[ii].distance != -1) ++node_count;
250  }
251  ca.cut(data_size - node_count, tree, clusters);
252 
253  //std::cerr << "Treesize: " << (tree.size()+1) << " #clusters: " << clusters.size() << std::endl;
254  //std::cerr << "tree:\n" << ca.newickTree(tree, true) << "\n";
255 
256  // convert to blocks
257  MergeBlocks spectra_to_merge;
258 
259  for (Size i_outer = 0; i_outer < clusters.size(); ++i_outer)
260  {
261  if (clusters[i_outer].size() <= 1) continue;
262  // init block with first cluster element
263  Size cl_index0 = clusters[i_outer][0];
264  spectra_to_merge[index_mapping[cl_index0]] = std::vector<Size>();
265  // add all other elements
266  for (Size i_inner = 1; i_inner < clusters[i_outer].size(); ++i_inner)
267  {
268  Size cl_index = clusters[i_outer][i_inner];
269  spectra_to_merge[index_mapping[cl_index0]].push_back(index_mapping[cl_index]);
270  }
271  }
272 
273  // do it
274  mergeSpectra_(exp, spectra_to_merge, 2);
275 
276  exp.sortSpectra();
277 
278  return;
279  }
280 
281  // @}
282 
283 protected:
284 
295  template <typename MapType>
296  void mergeSpectra_(MapType & exp, const MergeBlocks & spectra_to_merge, const UInt ms_level)
297  {
298  double mz_binning_width(param_.getValue("mz_binning_width"));
299  String mz_binning_unit(param_.getValue("mz_binning_width_unit"));
300 
301  // merge spectra
302  MapType merged_spectra;
303 
304  Map<Size, Size> cluster_sizes;
305  std::set<Size> merged_indices;
306 
307  // set up alignment
308  SpectrumAlignment sas;
309  Param p;
310  p.setValue("tolerance", mz_binning_width);
311  if (!(mz_binning_unit == "Da" || mz_binning_unit == "ppm")) throw Exception::IllegalSelfOperation(__FILE__, __LINE__, __PRETTY_FUNCTION__); // sanity check
312  // TODO : SpectrumAlignment does not implement is_relative_tolerance
313  p.setValue("is_relative_tolerance", mz_binning_unit == "Da" ? "false" : "true");
314  sas.setParameters(p);
315  std::vector<std::pair<Size, Size> > alignment;
316 
317  Size count_peaks_aligned(0);
318  Size count_peaks_overall(0);
319 
320  // each BLOCK
321  for (Map<Size, std::vector<Size> >::ConstIterator it = spectra_to_merge.begin(); it != spectra_to_merge.end(); ++it)
322  {
323 
324  ++cluster_sizes[it->second.size() + 1]; // for stats
325 
326  typename MapType::SpectrumType consensus_spec = exp[it->first];
327  consensus_spec.setMSLevel(ms_level);
328 
329  //consensus_spec.unify(exp[it->first]); // append meta info
330  merged_indices.insert(it->first);
331 
332  //typename MapType::SpectrumType all_peaks = exp[it->first];
333  double rt_average = consensus_spec.getRT();
334  double precursor_mz_average = 0.0;
335  Size precursor_count(0);
336  if (!consensus_spec.getPrecursors().empty())
337  {
338  precursor_mz_average = consensus_spec.getPrecursors()[0].getMZ();
339  ++precursor_count;
340  }
341 
342  count_peaks_overall += consensus_spec.size();
343 
344  // block elements
345  for (std::vector<Size>::const_iterator sit = it->second.begin(); sit != it->second.end(); ++sit)
346  {
347  consensus_spec.unify(exp[*sit]); // append meta info
348  merged_indices.insert(*sit);
349 
350  rt_average += exp[*sit].getRT();
351  if (ms_level >= 2 && exp[*sit].getPrecursors().size() > 0)
352  {
353  precursor_mz_average += exp[*sit].getPrecursors()[0].getMZ();
354  ++precursor_count;
355  }
356 
357  // merge data points
358  sas.getSpectrumAlignment(alignment, consensus_spec, exp[*sit]);
359  //std::cerr << "alignment of " << it->first << " with " << *sit << " yielded " << alignment.size() << " common peaks!\n";
360  count_peaks_aligned += alignment.size();
361  count_peaks_overall += exp[*sit].size();
362 
363  Size align_index(0);
364  Size spec_b_index(0);
365 
366  // sanity check for number of peaks
367  Size spec_a = consensus_spec.size(), spec_b = exp[*sit].size(), align_size = alignment.size();
368  for (typename MapType::SpectrumType::ConstIterator pit = exp[*sit].begin(); pit != exp[*sit].end(); ++pit)
369  {
370  // either add aligned peak height to existing peak
371  if (alignment.size() > 0 && alignment[align_index].second == spec_b_index)
372  {
373  consensus_spec[alignment[align_index].first].setIntensity(consensus_spec[alignment[align_index].first].getIntensity() +
374  pit->getIntensity());
375  ++align_index; // this aligned peak was explained, wait for next aligned peak ...
376  if (align_index == alignment.size()) alignment.clear(); // end reached -> avoid going into this block again
377  }
378  else // ... or add unaligned peak
379  {
380  consensus_spec.push_back(*pit);
381  }
382  ++spec_b_index;
383  }
384  consensus_spec.sortByPosition(); // sort, otherwise next alignment will fail
385  if (spec_a + spec_b - align_size != consensus_spec.size()) std::cerr << "\n\n ERRROR \n\n";
386  }
387  rt_average /= it->second.size() + 1;
388  consensus_spec.setRT(rt_average);
389 
390  if (ms_level >= 2)
391  {
392  if (precursor_count) precursor_mz_average /= precursor_count;
393  std::vector<Precursor> pcs = consensus_spec.getPrecursors();
394  //if (pcs.size()>1) LOG_WARN << "Removing excessive precursors - leaving only one per MS2 spectrum.\n";
395  pcs.resize(1);
396  pcs[0].setMZ(precursor_mz_average);
397  consensus_spec.setPrecursors(pcs);
398  }
399 
400  if (consensus_spec.empty()) continue;
401  else merged_spectra.addSpectrum(consensus_spec);
402  }
403 
404  LOG_INFO << "Cluster sizes:\n";
405  for (Map<Size, Size>::const_iterator it = cluster_sizes.begin(); it != cluster_sizes.end(); ++it)
406  {
407  LOG_INFO << " size " << it->first << ": " << it->second << "x\n";
408  }
409 
410  char buffer[200];
411  sprintf(buffer, "%d/%d (%.2f %%) of blocked spectra", (int)count_peaks_aligned,
412  (int)count_peaks_overall, float(count_peaks_aligned) / float(count_peaks_overall) * 100.);
413  LOG_INFO << "Number of merged peaks: " << String(buffer) << "\n";
414 
415  // remove all spectra that were within a cluster
416  typename MapType::SpectrumType empty_spec;
417  MapType exp_tmp;
418  for (Size i = 0; i < exp.size(); ++i)
419  {
420  if (merged_indices.count(i) == 0) // save unclustered ones
421  {
422  exp_tmp.addSpectrum(exp[i]);
423  exp[i] = empty_spec;
424  }
425  }
426 
427  //typedef std::vector<typename MapType::SpectrumType> Base;
428  //exp.Base::operator=(exp_tmp);
429  exp.clear(false);
430  exp.getSpectra().insert(exp.end(), exp_tmp.begin(), exp_tmp.end());
431 
432  // exp.erase(remove_if(exp.begin(), exp.end(), InMSLevelRange<typename MapType::SpectrumType>(ListUtils::create<int>(String(ms_level)), false)), exp.end());
433 
434  // ... and add consensus spectra
435  exp.getSpectra().insert(exp.end(), merged_spectra.begin(), merged_spectra.end());
436 
437  }
438 
439  };
440 
441 }
442 #endif //OPENMS_FILTERING_TRANSFORMERS_SPECTRAMERGER_H
void setValue(const String &key, const DataValue &value, const String &description="", const StringList &tags=StringList())
Sets a value.
A more convenient string class.
Definition: String.h:57
Size size() const
Definition: MSExperiment.h:117
void setMZ(CoordinateType coordinate)
Mutable access to the m/z coordinate (index 1)
Definition: Peak2D.h:197
#define LOG_INFO
Macro if a information, e.g. a status should be reported.
Definition: LogStream.h:455
void sortByPosition()
Lexicographically sorts the peaks by their position.
Definition: MSSpectrum.h:419
Bundles analyzing tools for a clustering (given as sequence of BinaryTreeNode's)
Definition: ClusterAnalyzer.h:52
void mergeSpectra_(MapType &exp, const MergeBlocks &spectra_to_merge, const UInt ms_level)
merges blocks of spectra of a certain level
Definition: SpectraMerger.h:296
A two-dimensional distance matrix, similar to OpenMS::Matrix.
Definition: DistanceMatrix.h:68
void mergeSpectraBlockWise(MapType &exp)
Definition: SpectraMerger.h:148
Iterator begin()
Definition: MSExperiment.h:147
std::vector< Int > IntList
Vector of signed integers.
Definition: ListUtils.h:59
Merges blocks of MS or MS2 spectra.
Definition: SpectraMerger.h:61
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:128
void sortSpectra(bool sort_mz=true)
Sorts the data points by retention time.
Definition: MSExperiment.h:600
Base::const_iterator const_iterator
Definition: MSExperiment.h:115
CoordinateType getRT() const
Returns the RT coordinate (index 0)
Definition: Peak2D.h:203
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
void setParameters(const Param &param)
Sets the parameters.
SpectraDistance_()
Definition: SpectraMerger.h:78
A basic LC-MS feature.
Definition: BaseFeature.h:56
#define LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged...
Definition: LogStream.h:451
Iterator end()
Definition: MSExperiment.h:157
void setRT(CoordinateType coordinate)
Mutable access to the RT coordinate (index 0)
Definition: Peak2D.h:209
double rt_max_
Definition: SpectraMerger.h:116
double getRT() const
Definition: MSSpectrum.h:243
void setMSLevel(UInt ms_level)
Sets the MS level.
Definition: MSSpectrum.h:265
Definition: SpectraMerger.h:74
const std::vector< MSSpectrum< PeakT > > & getSpectra() const
returns the spectra list
Definition: MSExperiment.h:764
Aligns the peaks of two spectra.
Definition: SpectrumAlignment.h:62
void addSpectrum(const MSSpectrum< PeakT > &spectrum)
adds a spectra to the list
Definition: MSExperiment.h:758
void setRT(double rt)
Sets the absolute retention time (is seconds)
Definition: MSSpectrum.h:249
Management and storage of parameters / INI files.
Definition: Param.h:75
Map< Size, std::vector< Size > > MergeBlocks
blocks of spectra (master-spectrum index to sacrifice-spectra(the ones being merged into the master-s...
Definition: SpectraMerger.h:124
CoordinateType getMZ() const
Returns the m/z coordinate (index 1)
Definition: Peak2D.h:191
SingleLinkage ClusterMethod.
Definition: SingleLinkage.h:58
void unify(const SpectrumSettings &rhs)
merge another spectrum setting into this one (data is usually appended, except for spectrum type whic...
Illegal self operation exception.
Definition: Exception.h:379
std::vector< SpectrumType >::const_iterator ConstIterator
Non-mutable iterator.
Definition: MSExperiment.h:103
void setPrecursors(const std::vector< Precursor > &precursors)
sets the precursors
double operator()(const BaseFeature &first, const BaseFeature &second) const
Definition: SpectraMerger.h:101
double mz_max_
Definition: SpectraMerger.h:117
double getSimilarity(const double d_rt, const double d_mz) const
Definition: SpectraMerger.h:94
void clear(bool clear_meta_data)
Clears all data and meta data.
Definition: MSExperiment.h:850
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:92
void cut(const Size cluster_quantity, const std::vector< BinaryTreeNode > &tree, std::vector< std::vector< Size > > &clusters)
Method to calculate a partition resulting from a certain step in clustering given by the number of cl...
void mergeSpectraPrecursors(MapType &exp)
merges spectra with similar precursors (must have MS2 level)
Definition: SpectraMerger.h:201
const std::vector< Precursor > & getPrecursors() const
returns a const reference to the precursors
Hierarchical clustering with generic clustering functions.
Definition: ClusterHierarchical.h:64
void getSpectrumAlignment(std::vector< std::pair< Size, Size > > &alignment, const SpectrumType &s1, const SpectrumType &s2) const
Definition: SpectrumAlignment.h:83
int Int
Signed integer type.
Definition: Types.h:96
Map class based on the STL map (containing several convenience functions)
Definition: Map.h:51
void updateMembers_()
This method is used to update extra member variables at the end of the setParameters() method...
Definition: SpectraMerger.h:86
void cluster(std::vector< Data > &data, const SimilarityComparator &comparator, const ClusterFunctor &clusterer, std::vector< BinaryTreeNode > &cluster_tree, DistanceMatrix< float > &original_distance)
Clustering function.
Definition: ClusterHierarchical.h:108
Not all required information provided.
Definition: Exception.h:196

OpenMS / TOPP release 2.0.0 Documentation generated on Sat May 16 2015 16:13:34 using doxygen 1.8.9.1