IsoSpec  2.1.2
fixedEnvelopes.cpp
1 /*
2  * Copyright (C) 2015-2020 Mateusz Łącki and Michał Startek.
3  *
4  * This file is part of IsoSpec.
5  *
6  * IsoSpec is free software: you can redistribute it and/or modify
7  * it under the terms of the Simplified ("2-clause") BSD licence.
8  *
9  * IsoSpec is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12  *
13  * You should have received a copy of the Simplified BSD Licence
14  * along with IsoSpec. If not, see <https://opensource.org/licenses/BSD-2-Clause>.
15  */
16 
17 #include "fixedEnvelopes.h"
18 #include <limits>
19 #include "isoMath.h"
20 
21 namespace IsoSpec
22 {
23 
24 FixedEnvelope::FixedEnvelope(const FixedEnvelope& other) :
25 _masses(array_copy<double>(other._masses, other._confs_no)),
26 _probs(array_copy<double>(other._probs, other._confs_no)),
27 _confs(array_copy<int>(other._confs, other._confs_no*other.allDim)),
28 _confs_no(other._confs_no),
29 allDim(other.allDim),
30 sorted_by_mass(other.sorted_by_mass),
31 sorted_by_prob(other.sorted_by_prob),
32 total_prob(other.total_prob)
33 {}
34 
35 FixedEnvelope::FixedEnvelope(FixedEnvelope&& other) :
36 _masses(other._masses),
37 _probs(other._probs),
38 _confs(other._confs),
39 _confs_no(other._confs_no),
40 allDim(other.allDim),
41 sorted_by_mass(other.sorted_by_mass),
42 sorted_by_prob(other.sorted_by_prob),
43 total_prob(other.total_prob)
44 {
45 other._masses = nullptr;
46 other._probs = nullptr;
47 other._confs = nullptr;
48 other._confs_no = 0;
49 other.total_prob = 0.0;
50 }
51 
52 FixedEnvelope::FixedEnvelope(double* in_masses, double* in_probs, size_t in_confs_no, bool masses_sorted, bool probs_sorted, double _total_prob) :
53 _masses(in_masses),
54 _probs(in_probs),
55 _confs(nullptr),
56 _confs_no(in_confs_no),
57 allDim(0),
58 sorted_by_mass(masses_sorted),
59 sorted_by_prob(probs_sorted),
60 total_prob(_total_prob)
61 {}
62 
63 FixedEnvelope FixedEnvelope::operator+(const FixedEnvelope& other) const
64 {
65  double* nprobs = reinterpret_cast<double*>(malloc(sizeof(double) * (_confs_no+other._confs_no)));
66  if(nprobs == nullptr)
67  throw std::bad_alloc();
68  double* nmasses = reinterpret_cast<double*>(malloc(sizeof(double) * (_confs_no+other._confs_no)));
69  if(nmasses == nullptr)
70  {
71  free(nprobs);
72  throw std::bad_alloc();
73  }
74 
75  memcpy(nprobs, _probs, sizeof(double) * _confs_no);
76  memcpy(nmasses, _masses, sizeof(double) * _confs_no);
77 
78  memcpy(nprobs+_confs_no, other._probs, sizeof(double) * other._confs_no);
79  memcpy(nmasses+_confs_no, other._masses, sizeof(double) * other._confs_no);
80 
81  return FixedEnvelope(nmasses, nprobs, _confs_no + other._confs_no);
82 }
83 
84 FixedEnvelope FixedEnvelope::operator*(const FixedEnvelope& other) const
85 {
86  double* nprobs = reinterpret_cast<double*>(malloc(sizeof(double) * _confs_no * other._confs_no));
87  if(nprobs == nullptr)
88  throw std::bad_alloc();
89  // deepcode ignore CMemoryLeak: It's not a memleak: the memory is passed to FixedEnvelope which
90  // deepcode ignore CMemoryLeak: takes ownership of it, and will properly free() it in destructor.
91  double* nmasses = reinterpret_cast<double*>(malloc(sizeof(double) * _confs_no * other._confs_no));
92  if(nmasses == nullptr)
93  {
94  free(nprobs);
95  throw std::bad_alloc();
96  }
97 
98  size_t tgt_idx = 0;
99 
100  for(size_t ii = 0; ii < _confs_no; ii++)
101  for(size_t jj = 0; jj < other._confs_no; jj++)
102  {
103  nprobs[tgt_idx] = _probs[ii] * other._probs[jj];
104  nmasses[tgt_idx] = _masses[ii] + other._masses[jj];
105  tgt_idx++;
106  }
107 
108  return FixedEnvelope(nmasses, nprobs, tgt_idx);
109 }
110 
111 void FixedEnvelope::sort_by_mass()
112 {
113  if(sorted_by_mass)
114  return;
115 
116  sort_by(_masses);
117 
118  sorted_by_mass = true;
119  sorted_by_prob = false;
120 }
121 
122 
123 void FixedEnvelope::sort_by_prob()
124 {
125  if(sorted_by_prob)
126  return;
127 
128  sort_by(_probs);
129 
130  sorted_by_prob = true;
131  sorted_by_mass = false;
132 }
133 
134 template<typename T> void reorder_array(T* arr, size_t* order, size_t size, bool can_destroy = false)
135 {
136  if(!can_destroy)
137  {
138  size_t* order_c = new size_t[size];
139  memcpy(order_c, order, sizeof(size_t)*size);
140  order = order_c;
141  }
142 
143  for(size_t ii = 0; ii < size; ii++)
144  while(order[ii] != ii)
145  {
146  std::swap(arr[ii], arr[order[ii]]);
147  std::swap(order[order[ii]], order[ii]);
148  }
149 
150  if(!can_destroy)
151  delete[] order;
152 }
153 
154 void FixedEnvelope::sort_by(double* order)
155 {
156  size_t* indices = new size_t[_confs_no];
157 
158  for(size_t ii = 0; ii < _confs_no; ii++)
159  indices[ii] = ii;
160 
161  std::sort<size_t*>(indices, indices + _confs_no, TableOrder<double>(order));
162 
163  size_t* inverse = new size_t[_confs_no];
164 
165  for(size_t ii = 0; ii < _confs_no; ii++)
166  inverse[indices[ii]] = ii;
167 
168  delete[] indices;
169 
170  reorder_array(_masses, inverse, _confs_no);
171  reorder_array(_probs, inverse, _confs_no);
172  if(_confs != nullptr)
173  {
174  int* swapspace = new int[allDim];
175  for(size_t ii = 0; ii < _confs_no; ii++)
176  while(order[ii] != ii)
177  {
178  memcpy(swapspace, &_confs[ii*allDim], allDimSizeofInt);
179  memcpy(&_confs[ii*allDim], &_confs[inverse[ii]*allDim], allDimSizeofInt);
180  memcpy(&_confs[inverse[ii]*allDim], swapspace, allDimSizeofInt);
181  }
182  delete[] swapspace;
183  }
184  delete[] inverse;
185 }
186 
187 
188 double FixedEnvelope::get_total_prob()
189 {
190  if(std::isnan(total_prob))
191  {
192  total_prob = 0.0;
193  for(size_t ii = 0; ii < _confs_no; ii++)
194  total_prob += _probs[ii];
195  }
196  return total_prob;
197 }
198 
199 void FixedEnvelope::scale(double factor)
200 {
201  for(size_t ii = 0; ii < _confs_no; ii++)
202  _probs[ii] *= factor;
203  total_prob *= factor;
204 }
205 
206 void FixedEnvelope::normalize()
207 {
208  double tp = get_total_prob();
209  if(tp != 1.0)
210  {
211  scale(1.0/tp);
212  total_prob = 1.0;
213  }
214 }
215 
216 FixedEnvelope FixedEnvelope::LinearCombination(const std::vector<const FixedEnvelope*>& spectra, const std::vector<double>& intensities)
217 {
218  return LinearCombination(spectra.data(), intensities.data(), spectra.size());
219 }
220 
221 FixedEnvelope FixedEnvelope::LinearCombination(const FixedEnvelope* const * spectra, const double* intensities, size_t size)
222 {
223  size_t ret_size = 0;
224  for(size_t ii = 0; ii < size; ii++)
225  ret_size += spectra[ii]->_confs_no;
226 
227  double* newprobs = reinterpret_cast<double*>(malloc(sizeof(double)*ret_size));
228  if(newprobs == nullptr)
229  throw std::bad_alloc();
230  double* newmasses = reinterpret_cast<double*>(malloc(sizeof(double)*ret_size));
231  if(newmasses == nullptr)
232  {
233  free(newprobs);
234  throw std::bad_alloc();
235  }
236 
237  size_t cntr = 0;
238  for(size_t ii = 0; ii < size; ii++)
239  {
240  double mul = intensities[ii];
241  for(size_t jj = 0; jj < spectra[ii]->_confs_no; jj++)
242  newprobs[jj+cntr] = spectra[ii]->_probs[jj] * mul;
243  memcpy(newmasses + cntr, spectra[ii]->_masses, sizeof(double) * spectra[ii]->_confs_no);
244  cntr += spectra[ii]->_confs_no;
245  }
246  return FixedEnvelope(newmasses, newprobs, cntr);
247 }
248 
249 double FixedEnvelope::WassersteinDistance(FixedEnvelope& other)
250 {
251  double ret = 0.0;
252  if((get_total_prob()*0.999 > other.get_total_prob()) || (other.get_total_prob() > get_total_prob()*1.001))
253  throw std::logic_error("Spectra must be normalized before computing Wasserstein Distance");
254 
255  if(_confs_no == 0 || other._confs_no == 0)
256  return 0.0;
257 
258  sort_by_mass();
259  other.sort_by_mass();
260 
261  size_t idx_this = 0;
262  size_t idx_other = 0;
263 
264  double acc_prob = 0.0;
265  double last_point = 0.0;
266 
267 
268  while(idx_this < _confs_no && idx_other < other._confs_no)
269  {
270  if(_masses[idx_this] < other._masses[idx_other])
271  {
272  ret += (_masses[idx_this] - last_point) * std::abs(acc_prob);
273  acc_prob += _probs[idx_this];
274  last_point = _masses[idx_this];
275  idx_this++;
276  }
277  else
278  {
279  ret += (other._masses[idx_other] - last_point) * std::abs(acc_prob);
280  acc_prob -= other._probs[idx_other];
281  last_point = other._masses[idx_other];
282  idx_other++;
283  }
284  }
285 
286  acc_prob = std::abs(acc_prob);
287 
288  while(idx_this < _confs_no)
289  {
290  ret += (_masses[idx_this] - last_point) * acc_prob;
291  acc_prob -= _probs[idx_this];
292  last_point = _masses[idx_this];
293  idx_this++;
294  }
295 
296  while(idx_other < other._confs_no)
297  {
298  ret += (other._masses[idx_other] - last_point) * acc_prob;
299  acc_prob -= other._probs[idx_other];
300  last_point = other._masses[idx_other];
301  idx_other++;
302  }
303 
304  return ret;
305 }
306 
307 
308 double FixedEnvelope::OrientedWassersteinDistance(FixedEnvelope& other)
309 {
310  double ret = 0.0;
311  if((get_total_prob()*0.999 > other.get_total_prob()) || (other.get_total_prob() > get_total_prob()*1.001))
312  throw std::logic_error("Spectra must be normalized before computing Wasserstein Distance");
313 
314  if(_confs_no == 0 || other._confs_no == 0)
315  return 0.0;
316 
317  sort_by_mass();
318  other.sort_by_mass();
319 
320  size_t idx_this = 0;
321  size_t idx_other = 0;
322 
323  double acc_prob = 0.0;
324  double last_point = 0.0;
325 
326 
327  while(idx_this < _confs_no && idx_other < other._confs_no)
328  {
329  if(_masses[idx_this] < other._masses[idx_other])
330  {
331  ret += (_masses[idx_this] - last_point) * acc_prob;
332  acc_prob += _probs[idx_this];
333  last_point = _masses[idx_this];
334  idx_this++;
335  }
336  else
337  {
338  ret += (other._masses[idx_other] - last_point) * acc_prob;
339  acc_prob -= other._probs[idx_other];
340  last_point = other._masses[idx_other];
341  idx_other++;
342  }
343  }
344 
345  while(idx_this < _confs_no)
346  {
347  ret += (_masses[idx_this] - last_point) * acc_prob;
348  acc_prob -= _probs[idx_this];
349  last_point = _masses[idx_this];
350  idx_this++;
351  }
352 
353  while(idx_other < other._confs_no)
354  {
355  ret += (other._masses[idx_other] - last_point) * acc_prob;
356  acc_prob -= other._probs[idx_other];
357  last_point = other._masses[idx_other];
358  idx_other++;
359  }
360 
361  return ret;
362 }
363 
364 FixedEnvelope FixedEnvelope::bin(double bin_width, double middle)
365 {
366  sort_by_mass();
367 
368  FixedEnvelope ret;
369 
370  if(_confs_no == 0)
371  return ret;
372 
373  ret.reallocate_memory<false>(ISOSPEC_INIT_TABLE_SIZE);
374  ret.current_size = ISOSPEC_INIT_TABLE_SIZE;
375 
376  size_t ii = 0;
377 
378  double half_width = 0.5*bin_width;
379  double hwmm = half_width-middle;
380 
381  while(ii < _confs_no)
382  {
383  double current_bin_middle = floor((_masses[ii]+hwmm)/bin_width)*bin_width + middle;
384  double current_bin_end = current_bin_middle + half_width;
385  double bin_prob = 0.0;
386 
387  while(ii < _confs_no && _masses[ii] <= current_bin_end)
388  {
389  bin_prob += _probs[ii];
390  ii++;
391  }
392  ret.store_conf(current_bin_middle, bin_prob);
393  }
394 
395  return ret;
396 }
397 
398 template<bool tgetConfs> void FixedEnvelope::reallocate_memory(size_t new_size)
399 {
400  // FIXME: Handle overflow gracefully here. It definitely could happen for people still stuck on 32 bits...
401  _masses = reinterpret_cast<double*>(realloc(_masses, new_size * sizeof(double)));
402  if(_masses == nullptr)
403  throw std::bad_alloc();
404  tmasses = _masses + _confs_no;
405 
406  _probs = reinterpret_cast<double*>(realloc(_probs, new_size * sizeof(double)));
407  if(_probs == nullptr)
408  throw std::bad_alloc();
409  tprobs = _probs + _confs_no;
410 
411  constexpr_if(tgetConfs)
412  {
413  _confs = reinterpret_cast<int*>(realloc(_confs, new_size * allDimSizeofInt));
414  if(_confs == nullptr)
415  throw std::bad_alloc();
416  tconfs = _confs + (allDim * _confs_no);
417  }
418 }
419 
420 void FixedEnvelope::slow_reallocate_memory(size_t new_size)
421 {
422  // FIXME: Handle overflow gracefully here. It definitely could happen for people still stuck on 32 bits...
423  _masses = reinterpret_cast<double*>(realloc(_masses, new_size * sizeof(double)));
424  if(_masses == nullptr)
425  throw std::bad_alloc();
426  tmasses = _masses + _confs_no;
427 
428  _probs = reinterpret_cast<double*>(realloc(_probs, new_size * sizeof(double)));
429  if(_probs == nullptr)
430  throw std::bad_alloc();
431  tprobs = _probs + _confs_no;
432 
433  if(_confs != nullptr)
434  {
435  _confs = reinterpret_cast<int*>(realloc(_confs, new_size * allDimSizeofInt));
436  if(_confs == nullptr)
437  throw std::bad_alloc();
438  tconfs = _confs + (allDim * _confs_no);
439  }
440 }
441 
442 template<bool tgetConfs> void FixedEnvelope::threshold_init(Iso&& iso, double threshold, bool absolute)
443 {
444  IsoThresholdGenerator generator(std::move(iso), threshold, absolute);
445 
446  size_t tab_size = generator.count_confs();
447  this->allDim = generator.getAllDim();
448  this->allDimSizeofInt = this->allDim * sizeof(int);
449 
450  this->reallocate_memory<tgetConfs>(tab_size);
451 
452  double* ttmasses = this->_masses;
453  double* ttprobs = this->_probs;
454  ISOSPEC_MAYBE_UNUSED int* ttconfs;
455  constexpr_if(tgetConfs)
456  ttconfs = _confs;
457 
458  while(generator.advanceToNextConfiguration())
459  {
460  *ttmasses = generator.mass(); ttmasses++;
461  *ttprobs = generator.prob(); ttprobs++;
462  constexpr_if(tgetConfs) { generator.get_conf_signature(ttconfs); ttconfs += allDim; }
463  }
464 
465  this->_confs_no = tab_size;
466 }
467 
468 template void FixedEnvelope::threshold_init<true>(Iso&& iso, double threshold, bool absolute);
469 template void FixedEnvelope::threshold_init<false>(Iso&& iso, double threshold, bool absolute);
470 
471 
472 template<bool tgetConfs> void FixedEnvelope::total_prob_init(Iso&& iso, double target_total_prob, bool optimize)
473 {
474  if(target_total_prob <= 0.0)
475  return;
476 
477  if(target_total_prob >= 1.0)
478  {
479  threshold_init<tgetConfs>(std::move(iso), 0.0, true);
480  return;
481  }
482 
483  current_size = ISOSPEC_INIT_TABLE_SIZE;
484 
485  IsoLayeredGenerator generator(std::move(iso), 1000, 1000, true, std::min<double>(target_total_prob, 0.9999));
486 
487  this->allDim = generator.getAllDim();
488  this->allDimSizeofInt = this->allDim*sizeof(int);
489 
490 
491  this->reallocate_memory<tgetConfs>(ISOSPEC_INIT_TABLE_SIZE);
492 
493  size_t last_switch = 0;
494  double prob_at_last_switch = 0.0;
495  double prob_so_far = 0.0;
496  double layer_delta;
497 
498  const double sum_above = log1p(-target_total_prob) - 2.3025850929940455; // log(0.1);
499 
500  do
501  { // Store confs until we accumulate more prob than needed - and, if optimizing,
502  // store also the rest of the last layer
503  while(generator.advanceToNextConfigurationWithinLayer())
504  {
505  this->template addConfILG<tgetConfs>(generator);
506  prob_so_far += *(tprobs-1); // The just-stored probability
507  if(prob_so_far >= target_total_prob)
508  {
509  if (optimize)
510  {
511  while(generator.advanceToNextConfigurationWithinLayer())
512  this->template addConfILG<tgetConfs>(generator);
513  break;
514  }
515  else
516  return;
517  }
518  }
519  if(prob_so_far >= target_total_prob)
520  break;
521 
522  last_switch = this->_confs_no;
523  prob_at_last_switch = prob_so_far;
524 
525  layer_delta = sum_above - log1p(-prob_so_far);
526  layer_delta = (std::max)((std::min)(layer_delta, -0.1), -5.0);
527  } while(generator.nextLayer(layer_delta));
528 
529  if(!optimize || prob_so_far <= target_total_prob)
530  return;
531 
532  // Right. We have extra configurations and we have been asked to produce an optimal p-set, so
533  // now we shall trim unneeded configurations, using an algorithm dubbed "quicktrim"
534  // - similar to the quickselect algorithm, except that we use the cumulative sum of elements
535  // left of pivot to decide whether to go left or right, instead of the positional index.
536  // We'll be sorting by the prob array, permuting the other ones in parallel.
537 
538  int* conf_swapspace = nullptr;
539  constexpr_if(tgetConfs)
540  conf_swapspace = reinterpret_cast<int*>(malloc(this->allDimSizeofInt));
541 
542  size_t start = last_switch;
543  size_t end = this->_confs_no;
544  double sum_to_start = prob_at_last_switch;
545 
546  while(start < end)
547  {
548  // Partition part
549  size_t len = end - start;
550 #if ISOSPEC_BUILDING_R
551  size_t pivot = len/2 + start;
552 #else
553  size_t pivot = random_gen() % len + start; // Using Mersenne twister directly - we don't
554  // need a very uniform distribution just for pivot
555  // selection
556 #endif
557  double pprob = this->_probs[pivot];
558  swap<tgetConfs>(pivot, end-1, conf_swapspace);
559 
560  double new_csum = sum_to_start;
561 
562  size_t loweridx = start;
563  for(size_t ii = start; ii < end-1; ii++)
564  if(this->_probs[ii] > pprob)
565  {
566  swap<tgetConfs>(ii, loweridx, conf_swapspace);
567  new_csum += this->_probs[loweridx];
568  loweridx++;
569  }
570 
571  swap<tgetConfs>(end-1, loweridx, conf_swapspace);
572 
573  // Selection part
574  if(new_csum < target_total_prob)
575  {
576  start = loweridx + 1;
577  sum_to_start = new_csum + this->_probs[loweridx];
578  }
579  else
580  end = loweridx;
581  }
582 
583  constexpr_if(tgetConfs)
584  free(conf_swapspace);
585 
586  if(end <= current_size/2)
587  // Overhead in memory of 2x or more, shrink to fit
588  this->template reallocate_memory<tgetConfs>(end);
589 
590  this->_confs_no = end;
591 }
592 
593 template void FixedEnvelope::total_prob_init<true>(Iso&& iso, double target_total_prob, bool optimize);
594 template void FixedEnvelope::total_prob_init<false>(Iso&& iso, double target_total_prob, bool optimize);
595 
596 
597 double FixedEnvelope::empiric_average_mass()
598 {
599  double ret = 0.0;
600  for(size_t ii = 0; ii < _confs_no; ii++)
601  {
602  ret += _masses[ii] * _probs[ii];
603  }
604  return ret / get_total_prob();
605 }
606 
607 double FixedEnvelope::empiric_variance()
608 {
609  double ret = 0.0;
610  double avg = empiric_average_mass();
611  for(size_t ii = 0; ii < _confs_no; ii++)
612  {
613  double msq = _masses[ii] - avg;
614  ret += msq * msq * _probs[ii];
615  }
616 
617  return ret / get_total_prob();
618 }
619 
620 } // namespace IsoSpec
IsoSpec
Definition: allocator.cpp:20