IsoSpec  2.1.2
marginalTrek++.cpp
1 /*
2  * Copyright (C) 2015-2020 Mateusz Łącki and Michał Startek.
3  *
4  * This file is part of IsoSpec.
5  *
6  * IsoSpec is free software: you can redistribute it and/or modify
7  * it under the terms of the Simplified ("2-clause") BSD licence.
8  *
9  * IsoSpec is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12  *
13  * You should have received a copy of the Simplified BSD Licence
14  * along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
15  */
16 
17 
18 #include <cmath>
19 #include <algorithm>
20 #include <vector>
21 #include <cstdlib>
22 #include <queue>
23 #include <utility>
24 #include <cstring>
25 #include <string>
26 #include <limits>
27 #include <memory>
28 #include "platform.h"
29 #include "marginalTrek++.h"
30 #include "conf.h"
31 #include "allocator.h"
32 #include "operators.h"
33 #include "summator.h"
34 #include "element_tables.h"
35 #include "misc.h"
36 
37 
38 namespace IsoSpec
39 {
40 
42 
50 void writeInitialConfiguration(const int atomCnt, const int isotopeNo, const double* lprobs, int* res)
51 {
57  // This approximates the mode (heuristics: the mean is close to the mode).
58  for(int i = 0; i < isotopeNo; ++i)
59  res[i] = static_cast<int>( atomCnt * exp(lprobs[i]) ) + 1;
60 
61  // The number of assigned atoms above.
62  int s = 0;
63 
64  for(int i = 0; i < isotopeNo; ++i) s += res[i];
65 
66  int diff = atomCnt - s;
67 
68  // Too little: enlarging fist index.
69  if( diff > 0 ){
70  res[0] += diff;
71  }
72  // Too much: trying to redistribute the difference: hopefully the first element is the largest.
73  if( diff < 0 ){
74  diff = abs(diff);
75  int i = 0;
76 
77  while( diff > 0){
78  int coordDiff = res[i] - diff;
79 
80  if( coordDiff >= 0 ){
81  res[i] -= diff;
82  diff = 0;
83  } else {
84  res[i] = 0;
85  i++;
86  diff = abs(coordDiff);
87  }
88  }
89  }
90 
91  // What we computed so far will be very close to the mode: hillclimb the rest of the way
92 
93  bool modified = true;
94  double LP = unnormalized_logProb(res, lprobs, isotopeNo);
95  double NLP;
96 
97  while(modified)
98  {
99  modified = false;
100  for(int ii = 0; ii < isotopeNo; ii++)
101  for(int jj = 0; jj < isotopeNo; jj++)
102  if(ii != jj && res[ii] > 0)
103  {
104  res[ii]--;
105  res[jj]++;
106  NLP = unnormalized_logProb(res, lprobs, isotopeNo);
107  if(NLP > LP || (NLP == LP && ii > jj))
108  {
109  modified = true;
110  LP = NLP;
111  }
112  else
113  {
114  res[ii]++;
115  res[jj]--;
116  }
117  }
118  }
119 }
120 
121 
122 double* getMLogProbs(const double* probs, int isoNo)
123 {
129  for(int ii = 0; ii < isoNo; ii++)
130  if(probs[ii] <= 0.0 || probs[ii] > 1.0)
131  throw std::invalid_argument("All isotope probabilities p must fulfill: 0.0 < p <= 1.0");
132 
133  double* ret = new double[isoNo];
134 
135  // here we change the table of probabilities and log it.
136  for(int i = 0; i < isoNo; i++)
137  {
138  ret[i] = log(probs[i]);
139  for(int j = 0; j < ISOSPEC_NUMBER_OF_ISOTOPIC_ENTRIES; j++)
140  if(elem_table_probability[j] == probs[i])
141  {
142  ret[i] = elem_table_log_probability[j];
143  break;
144  }
145  }
146  return ret;
147 }
148 
149 double get_loggamma_nominator(int x)
150 {
151  // calculate log gamma of the nominator calculated in the binomial exression.
152  double ret = lgamma(x+1);
153  return ret;
154 }
155 
156 int verify_atom_cnt(int atomCnt)
157 {
158  #if !ISOSPEC_BUILDING_OPENMS
159  if(ISOSPEC_G_FACT_TABLE_SIZE-1 <= atomCnt)
160  throw std::length_error("Subisotopologue too large, size limit (that is, the maximum number of atoms of a single element in a molecule) is: " + std::to_string(ISOSPEC_G_FACT_TABLE_SIZE-1));
161  #endif
162  return atomCnt;
163 }
164 
166  const double* _masses,
167  const double* _probs,
168  int _isotopeNo,
169  int _atomCnt
170 ) :
171 disowned(false),
172 isotopeNo(_isotopeNo),
173 atomCnt(verify_atom_cnt(_atomCnt)),
174 atom_lProbs(getMLogProbs(_probs, isotopeNo)),
175 atom_masses(array_copy<double>(_masses, _isotopeNo)),
176 loggamma_nominator(get_loggamma_nominator(_atomCnt)),
177 mode_conf(nullptr)
178 // Deliberately not initializing mode_lprob
179 {}
180 
182 disowned(false),
183 isotopeNo(other.isotopeNo),
184 atomCnt(other.atomCnt),
185 atom_lProbs(array_copy<double>(other.atom_lProbs, isotopeNo)),
186 atom_masses(array_copy<double>(other.atom_masses, isotopeNo)),
187 loggamma_nominator(other.loggamma_nominator)
188 {
189  if(other.mode_conf == nullptr)
190  {
191  mode_conf = nullptr;
192  // Deliberately not initializing mode_lprob. In this state other.mode_lprob is uninitialized too.
193  }
194  else
195  {
196  mode_conf = array_copy<int>(other.mode_conf, isotopeNo);
197  mode_lprob = other.mode_lprob;
198  }
199 }
200 
201 
202 // the move-constructor: used in the specialization of the marginal.
204 disowned(other.disowned),
205 isotopeNo(other.isotopeNo),
206 atomCnt(other.atomCnt),
207 atom_lProbs(other.atom_lProbs),
208 atom_masses(other.atom_masses),
209 loggamma_nominator(other.loggamma_nominator)
210 {
211  other.disowned = true;
212  if(other.mode_conf == nullptr)
213  {
214  mode_conf = nullptr;
215  // Deliberately not initializing mode_lprob. In this state other.mode_lprob is uninitialized too.
216  }
217  else
218  {
219  mode_conf = other.mode_conf;
220  mode_lprob = other.mode_lprob;
221  }
222 }
223 
225 {
226  if(!disowned)
227  {
228  delete[] atom_masses;
229  delete[] atom_lProbs;
230  delete[] mode_conf;
231  }
232 }
233 
234 
236 {
237  Conf res = new int[isotopeNo];
239  return res;
240 }
241 
242 void Marginal::setupMode()
243 {
244  ISOSPEC_IMPOSSIBLE(mode_conf != nullptr);
246  mode_lprob = logProb(mode_conf);
247 }
248 
249 
251 {
252  double ret_mass = std::numeric_limits<double>::infinity();
253  for(unsigned int ii = 0; ii < isotopeNo; ii++)
254  if( ret_mass > atom_masses[ii] )
255  ret_mass = atom_masses[ii];
256  return ret_mass*atomCnt;
257 }
258 
260 {
261  double ret_mass = 0.0;
262  for(unsigned int ii = 0; ii < isotopeNo; ii++)
263  if( ret_mass < atom_masses[ii] )
264  ret_mass = atom_masses[ii];
265  return ret_mass*atomCnt;
266 }
267 
269 {
270  double found_prob = -std::numeric_limits<double>::infinity();
271  double found_mass = 0.0; // to avoid uninitialized var warning
272  for(unsigned int ii = 0; ii < isotopeNo; ii++)
273  if( found_prob < atom_lProbs[ii] )
274  {
275  found_prob = atom_lProbs[ii];
276  found_mass = atom_masses[ii];
277  }
278  return found_mass*atomCnt;
279 }
280 
282 {
283  double ret = 0.0;
284  for(unsigned int ii = 0; ii < isotopeNo; ii++)
285  ret += exp(atom_lProbs[ii]) * atom_masses[ii];
286  return ret;
287 }
288 
289 double Marginal::variance() const
290 {
291  double ret = 0.0;
292  double mean = getAtomAverageMass();
293  for(size_t ii = 0; ii < isotopeNo; ii++)
294  {
295  double msq = atom_masses[ii] - mean;
296  ret += exp(atom_lProbs[ii]) * msq * msq;
297  }
298  return ret * atomCnt;
299 }
300 
301 double Marginal::getLogSizeEstimate(double logEllipsoidRadius) const
302 {
303  if(isotopeNo <= 1)
304  return -std::numeric_limits<double>::infinity();
305 
306  const double i = static_cast<double>(isotopeNo);
307  const double k = i - 1.0;
308  const double n = static_cast<double>(atomCnt);
309 
310  double sum_lprobs = 0.0;
311  for(int jj = 0; jj < i; jj++)
312  sum_lprobs += atom_lProbs[jj];
313 
314  double log_V_simplex = k * log(n) - lgamma(i);
315  double log_N_simplex = lgamma(n+i) - lgamma(n+1.0) - lgamma(i);
316  double log_V_ellipsoid = (k * (log(n) + logpi + logEllipsoidRadius) + sum_lprobs) * 0.5 - lgamma((i+1)*0.5);
317 
318  return log_N_simplex + log_V_ellipsoid - log_V_simplex;
319 }
320 
321 
322 // this is roughly an equivalent of IsoSpec-Threshold-Generator
324  Marginal&& m,
325  int tabSize,
326  int
327 ) :
328 Marginal(std::move(m)),
329 current_count(0),
330 orderMarginal(atom_lProbs, isotopeNo),
331 pq(),
332 allocator(isotopeNo, tabSize)
333 {
334  int* initialConf = allocator.makeCopy(mode_conf);
335 
336  pq.push({unnormalized_logProb(mode_conf), initialConf});
337 
338  current_count = 0;
339 
340  add_next_conf();
341 }
342 
343 
344 bool MarginalTrek::add_next_conf()
345 {
350  if(pq.size() < 1) return false;
351 
352  double logprob = pq.top().first + loggamma_nominator;
353  Conf topConf = pq.top().second;
354 
355  pq.pop();
356  ++current_count;
357 
358  _confs.push_back(topConf);
359 
360  _conf_masses.push_back(calc_mass(topConf, atom_masses, isotopeNo));
361  _conf_lprobs.push_back(logprob);
362 
363  for( unsigned int j = 0; j < isotopeNo; ++j )
364  {
365  if( topConf[j] > mode_conf[j])
366  continue;
367 
368  if( topConf[j] > 0 )
369  {
370  for( unsigned int i = 0; i < isotopeNo; ++i )
371  {
372  if( topConf[i] < mode_conf[i] )
373  continue;
374  // Growing index different than decreasing one AND
375  // Remain on simplex condition.
376  if( i != j ){
377  Conf acceptedCandidate = allocator.makeCopy(topConf);
378 
379  ++acceptedCandidate[i];
380  --acceptedCandidate[j];
381 
382  double new_prob = unnormalized_logProb(acceptedCandidate);
383 
384  pq.push({new_prob, acceptedCandidate});
385  }
386 
387  if( topConf[i] > mode_conf[i] )
388  break;
389  }
390  }
391  if( topConf[j] < mode_conf[j] )
392  break;
393  }
394 
395  return true;
396 }
397 
398 
399 MarginalTrek::~MarginalTrek()
400 {
401 }
402 
403 
404 
406  double lCutOff,
407  bool sort,
408  int tabSize,
409  int
410 ) : Marginal(std::move(m)),
411 allocator(isotopeNo, tabSize)
412 {
413  Conf currentConf = allocator.makeCopy(mode_conf);
414  if(logProb(currentConf) >= lCutOff)
415  {
416  configurations.push_back(currentConf);
417  lProbs.push_back(mode_lprob);
418  }
419 
420  unsigned int idx = 0;
421 
422  std::unique_ptr<double[]> prob_partials(new double[isotopeNo]);
423  std::unique_ptr<double[]> prob_part_acc(new double[isotopeNo+1]);
424  prob_part_acc[0] = loggamma_nominator;
425 
426  while(idx < configurations.size())
427  {
428  currentConf = configurations[idx];
429  idx++;
430 
431  for(size_t ii = 0; ii < isotopeNo; ii++)
432  prob_partials[ii] = minuslogFactorial(currentConf[ii]) + currentConf[ii] * atom_lProbs[ii];
433 
434  for(unsigned int ii = 0; ii < isotopeNo; ii++ )
435  {
436  if(currentConf[ii] > mode_conf[ii])
437  continue;
438 
439  if(currentConf[ii] != 0)
440  {
441  double prev_partial_ii = prob_partials[ii];
442  currentConf[ii]--;
443  prob_partials[ii] = minuslogFactorial(currentConf[ii]) + currentConf[ii] * atom_lProbs[ii];
444 
445  for(unsigned int jj = 0; jj < isotopeNo; jj++ )
446  {
447  prob_part_acc[jj+1] = prob_part_acc[jj] + prob_partials[jj];
448 
449  if(currentConf[jj] < mode_conf[jj])
450  continue;
451 
452  if( ii != jj )
453  {
454  double logp = prob_part_acc[jj] + minuslogFactorial(1+currentConf[jj]) + (1+currentConf[jj]) * atom_lProbs[jj];
455  for(size_t kk = jj+1; kk < isotopeNo; kk++)
456  logp += prob_partials[kk];
457 
458  if (logp >= lCutOff)
459  {
460  auto tmp = allocator.makeCopy(currentConf);
461  tmp[jj]++;
462  configurations.push_back(tmp);
463  lProbs.push_back(logp);
464  }
465  }
466  else
467  prob_part_acc[jj+1] = prob_part_acc[jj] + prob_partials[jj];
468 
469  if (currentConf[jj] > mode_conf[jj])
470  break;
471  }
472  currentConf[ii]++;
473  prob_partials[ii] = prev_partial_ii;
474  }
475 
476  if(currentConf[ii] < mode_conf[ii])
477  break;
478  }
479  }
480 
481  no_confs = configurations.size();
482  confs = configurations.data();
483 
484  if(sort && no_confs > 0)
485  {
486  std::unique_ptr<size_t[]> order_arr(get_inverse_order(lProbs.data(), no_confs));
487  impose_order(order_arr.get(), no_confs, lProbs.data(), confs);
488  }
489 
490  probs = new double[no_confs];
491  masses = new double[no_confs];
492 
493 
494  for(unsigned int ii = 0; ii < no_confs; ii++)
495  {
496  probs[ii] = exp(lProbs[ii]);
497  masses[ii] = calc_mass(confs[ii], atom_masses, isotopeNo);
498  }
499 
500  lProbs.push_back(-std::numeric_limits<double>::infinity());
501 }
502 
503 
505 {
506  if(masses != nullptr)
507  delete[] masses;
508  if(probs != nullptr)
509  delete[] probs;
510 }
511 
512 
513 
514 
515 
516 
517 
519 : Marginal(std::move(m)), current_threshold(1.0), allocator(isotopeNo, tabSize),
520 equalizer(isotopeNo), keyHasher(isotopeNo)
521 {
522  fringe.push_back(mode_conf);
523  lProbs.push_back(std::numeric_limits<double>::infinity());
524  fringe_unn_lprobs.push_back(unnormalized_logProb(mode_conf));
525  lProbs.push_back(-std::numeric_limits<double>::infinity());
526  guarded_lProbs = lProbs.data()+1;
527 }
528 
529 bool LayeredMarginal::extend(double new_threshold, bool do_sort)
530 {
531  new_threshold -= loggamma_nominator;
532  if(fringe.empty())
533  return false;
534 
535  lProbs.pop_back(); // Remove the +inf guardian
536 
537  pod_vector<Conf> new_fringe;
538  pod_vector<double> new_fringe_unn_lprobs;
539 
540  while(!fringe.empty())
541  {
542  Conf currentConf = fringe.back();
543  fringe.pop_back();
544 
545  double opc = fringe_unn_lprobs.back();
546 
547  fringe_unn_lprobs.pop_back();
548  if(opc < new_threshold)
549  {
550  new_fringe.push_back(currentConf);
551  new_fringe_unn_lprobs.push_back(opc);
552  }
553 
554  else
555  {
556  configurations.push_back(currentConf);
557  lProbs.push_back(opc+loggamma_nominator);
558  for(unsigned int ii = 0; ii < isotopeNo; ii++ )
559  {
560  if(currentConf[ii] > mode_conf[ii])
561  continue;
562 
563  if(currentConf[ii] > 0)
564  {
565  currentConf[ii]--;
566  for(unsigned int jj = 0; jj < isotopeNo; jj++ )
567  {
568  if(currentConf[jj] < mode_conf[jj])
569  continue;
570 
571  if( ii != jj )
572  {
573  Conf nc = allocator.makeCopy(currentConf);
574  nc[jj]++;
575 
576  double lpc = unnormalized_logProb(nc);
577  if(lpc >= new_threshold)
578  {
579  fringe.push_back(nc);
580  fringe_unn_lprobs.push_back(lpc);
581  }
582  else
583  {
584  new_fringe.push_back(nc);
585  new_fringe_unn_lprobs.push_back(lpc);
586  }
587  }
588 
589  if(currentConf[jj] > mode_conf[jj])
590  break;
591  }
592  currentConf[ii]++;
593  }
594 
595  if(currentConf[ii] < mode_conf[ii])
596  break;
597  }
598  }
599  }
600 
601  current_threshold = new_threshold;
602  fringe.swap(new_fringe);
603  fringe_unn_lprobs.swap(new_fringe_unn_lprobs);
604 
605  if(do_sort)
606  {
607  size_t to_sort_size = configurations.size() - probs.size();
608  if(to_sort_size > 0)
609  {
610  std::unique_ptr<size_t[]> order_arr(get_inverse_order(lProbs.data()+1+probs.size(), to_sort_size));
611  double* P = lProbs.data()+1+probs.size();
612  Conf* C = configurations.data()+probs.size();
613  size_t* O = order_arr.get();
614  impose_order(O, to_sort_size, P, C);
615  }
616  }
617 
618  if(probs.capacity() * 2 < configurations.size() + 2)
619  {
620  // Reserve space for new values
621  probs.reserve(configurations.size());
622  masses.reserve(configurations.size());
623  } // Otherwise we're growing slowly enough that standard reallocations on push_back work better - we waste some extra memory
624  // but don't reallocate on every call
625 
626 // printVector(lProbs);
627  for(unsigned int ii = probs.size(); ii < configurations.size(); ii++)
628  {
629  probs.push_back(exp(lProbs[ii+1]));
630  masses.push_back(calc_mass(configurations[ii], atom_masses, isotopeNo));
631  }
632 
633  lProbs.push_back(-std::numeric_limits<double>::infinity()); // Restore guardian
634 
635  guarded_lProbs = lProbs.data()+1; // Vector might have reallocated its backing storage
636 
637  return true;
638 }
639 
640 
642 {
643  double ret = std::numeric_limits<double>::infinity();
644  for(pod_vector<double>::const_iterator it = masses.cbegin(); it != masses.cend(); ++it)
645  if(*it < ret)
646  ret = *it;
647  return ret;
648 }
649 
650 
652 {
653  double ret = -std::numeric_limits<double>::infinity();
654  for(pod_vector<double>::const_iterator it = masses.cbegin(); it != masses.cend(); ++it)
655  if(*it > ret)
656  ret = *it;
657  return ret;
658 }
659 
660 } // namespace IsoSpec
IsoSpec::Marginal::getMonoisotopicConfMass
double getMonoisotopicConfMass() const
Get the mass of the monoisotopic subisotopologue.
Definition: marginalTrek++.cpp:268
IsoSpec::PrecalculatedMarginal::PrecalculatedMarginal
PrecalculatedMarginal(Marginal &&m, double lCutOff, bool sort=true, int tabSize=1000, int hashSize=1000)
The move constructor (disowns the Marginal).
Definition: marginalTrek++.cpp:405
IsoSpec::Marginal::variance
double variance() const
Calculate the variance of the theoretical distribution describing the subisotopologue.
Definition: marginalTrek++.cpp:289
IsoSpec::Marginal::Marginal
Marginal(const double *_masses, const double *_probs, int _isotopeNo, int _atomCnt)
Class constructor.
Definition: marginalTrek++.cpp:165
pod_vector< Conf >
IsoSpec::Marginal
The marginal distribution class (a subisotopologue).
Definition: marginalTrek++.h:42
IsoSpec::Marginal::unnormalized_logProb
ISOSPEC_FORCE_INLINE double unnormalized_logProb(Conf conf) const
Calculate the log-probability of a given subisotopologue.
Definition: marginalTrek++.h:159
IsoSpec::LayeredMarginal::LayeredMarginal
LayeredMarginal(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
Definition: marginalTrek++.cpp:518
IsoSpec::Marginal::getHeaviestConfMass
double getHeaviestConfMass() const
Get the mass of the heaviest subisotopologue.
Definition: marginalTrek++.cpp:259
IsoSpec
Definition: allocator.cpp:20
IsoSpec::PrecalculatedMarginal::~PrecalculatedMarginal
virtual ~PrecalculatedMarginal()
Destructor.
Definition: marginalTrek++.cpp:504
IsoSpec::Marginal::getLightestConfMass
double getLightestConfMass() const
Get the mass of the lightest subisotopologue.
Definition: marginalTrek++.cpp:250
IsoSpec::Marginal::mode_lprob
double mode_lprob
Definition: marginalTrek++.h:53
IsoSpec::getMLogProbs
double * getMLogProbs(const double *probs, int isoNo)
Definition: marginalTrek++.cpp:122
IsoSpec::Marginal::mode_conf
Conf mode_conf
Definition: marginalTrek++.h:52
IsoSpec::Marginal::atom_masses
const double *const atom_masses
Definition: marginalTrek++.h:50
IsoSpec::writeInitialConfiguration
void writeInitialConfiguration(const int atomCnt, const int isotopeNo, const double *lprobs, int *res)
Find one of the most probable subisotopologues.
Definition: marginalTrek++.cpp:50
IsoSpec::Marginal::computeModeConf
Conf computeModeConf() const
The the probability of the mode subisotopologue.
Definition: marginalTrek++.cpp:235
IsoSpec::Marginal::atomCnt
const unsigned int atomCnt
Definition: marginalTrek++.h:48
IsoSpec::Marginal::getAtomAverageMass
double getAtomAverageMass() const
The average mass of a single atom.
Definition: marginalTrek++.cpp:281
IsoSpec::Marginal::getLogSizeEstimate
double getLogSizeEstimate(double logEllipsoidRadius) const
Return estimated logarithm of size of the marginal at a given ellipsoid radius.
Definition: marginalTrek++.cpp:301
IsoSpec::LayeredMarginal::get_min_mass
double get_min_mass() const
Get the minimal mass in current layer.
Definition: marginalTrek++.cpp:641
IsoSpec::Marginal::atom_lProbs
const double *const atom_lProbs
Definition: marginalTrek++.h:49
IsoSpec::Marginal::~Marginal
virtual ~Marginal()
Destructor.
Definition: marginalTrek++.cpp:224
IsoSpec::LayeredMarginal::extend
bool extend(double new_threshold, bool do_sort=true)
Extend the set of computed subisotopologues to those above the new threshold.
Definition: marginalTrek++.cpp:529
IsoSpec::MarginalTrek::MarginalTrek
MarginalTrek(Marginal &&m, int tabSize=1000, int hashSize=1000)
Move constructor: specializes the Marginal class.
Definition: marginalTrek++.cpp:323
IsoSpec::Marginal::isotopeNo
const unsigned int isotopeNo
Definition: marginalTrek++.h:47
IsoSpec::Marginal::loggamma_nominator
const double loggamma_nominator
Definition: marginalTrek++.h:51
IsoSpec::LayeredMarginal::get_max_mass
double get_max_mass() const
Get the maximal mass in current layer.
Definition: marginalTrek++.cpp:651