Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Spectrum feature generator #178

Draft
wants to merge 21 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
fdceeba
initial commit
ArthurDeclercq Feb 24, 2024
5374ed8
finalize ms2 feature generation
ArthurDeclercq Feb 25, 2024
60207a3
add rustyms
ArthurDeclercq Feb 25, 2024
ae39844
remove exit statement fixed IM required value
ArthurDeclercq Feb 26, 2024
9b98c4d
change logger.info to debug
ArthurDeclercq Feb 26, 2024
5e45756
added profile decorator to get timings for functions
ArthurDeclercq Feb 26, 2024
304777c
removed profile as standard rescore debug statement
ArthurDeclercq Feb 26, 2024
95ee475
added new basic features
ArthurDeclercq Feb 26, 2024
73f4573
fixes for ms2 feature generator, removed multiprocessing
ArthurDeclercq Feb 26, 2024
947233e
return empty list on parsing error with rustyms, removed multiprocessing
ArthurDeclercq Feb 28, 2024
24ce565
add deeplc_calibration psm set
ArthurDeclercq Mar 15, 2024
114b006
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 17, 2024
33c38b0
remove unused import
ArthurDeclercq Apr 17, 2024
40425c7
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 19, 2024
b810b8c
Merge branch 'timsRescore' of https://github.com/compomics/ms2rescore…
ArthurDeclercq Apr 19, 2024
69b5d1a
Merge tag 'main' of https://github.com/compomics/ms2rescore into spec…
ArthurDeclercq Aug 16, 2024
6e2d102
Merge pull request #177 from compomics/main
ArthurDeclercq Aug 16, 2024
11fdc51
integrate mumble into ms2branch
ArthurDeclercq Aug 21, 2024
3140c44
Merge remote-tracking branch 'origin/main' into spectrum-feature-gene…
ArthurDeclercq Sep 23, 2024
883169a
temp removal of sage features before rescoring
ArthurDeclercq Sep 27, 2024
97865e7
Merge branch 'main' of https://github.com/compomics/ms2rescore into s…
ArthurDeclercq Sep 27, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ms2rescore/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,9 @@ class RescoringError(MS2RescoreError):
"""Error while rescoring PSMs."""

pass


class ParseSpectrumError(MS2RescoreError):
"""Error while rescoring PSMs."""

pass
2 changes: 2 additions & 0 deletions ms2rescore/feature_generators/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ms2rescore.feature_generators.maxquant import MaxQuantFeatureGenerator
from ms2rescore.feature_generators.ms2pip import MS2PIPFeatureGenerator
from ms2rescore.feature_generators.im2deep import IM2DeepFeatureGenerator
from ms2rescore.feature_generators.ms2 import MS2FeatureGenerator

FEATURE_GENERATORS = {
"basic": BasicFeatureGenerator,
Expand All @@ -16,4 +17,5 @@
"maxquant": MaxQuantFeatureGenerator,
"ionmob": IonMobFeatureGenerator,
"im2deep": IM2DeepFeatureGenerator,
"ms2": MS2FeatureGenerator,
}
13 changes: 13 additions & 0 deletions ms2rescore/feature_generators/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def add_features(self, psm_list: PSMList) -> None:
charge_states = np.array([psm.peptidoform.precursor_charge for psm in psm_list])
precursor_mzs = psm_list["precursor_mz"]
scores = psm_list["score"]
peptide_lengths = np.array([len(psm.peptidoform.sequence) for psm in psm_list])

has_charge = None not in charge_states
has_mz = None not in precursor_mzs and has_charge
Expand All @@ -74,13 +75,25 @@ def add_features(self, psm_list: PSMList) -> None:
if has_score:
self._feature_names.append("search_engine_score")

if has_mz and has_charge:
experimental_mass = (precursor_mzs * charge_n) - (charge_n * 1.007276466812)
theoretical_mass = (theo_mz * charge_n) - (charge_n * 1.007276466812)
mass_error = experimental_mass - theoretical_mass
self._feature_names.extend(["theoretical_mass", "experimental_mass", "mass_error"])

self._feature_names.append("pep_len")

for i, psm in enumerate(psm_list):
psm.rescoring_features.update(
dict(
**{"charge_n": charge_n[i]} if has_charge else {},
**charge_one_hot[i] if has_charge else {},
**{"abs_ms1_error_ppm": abs_ms1_error_ppm[i]} if has_mz else {},
**{"search_engine_score": scores[i]} if has_score else {},
**{"theoretical_mass": theoretical_mass[i]} if has_mz and has_charge else {},
**{"experimental_mass": experimental_mass[i]} if has_mz and has_charge else {},
**{"mass_error": mass_error[i]} if has_mz and has_charge else {},
**{"pep_len": peptide_lengths[i]},
)
)

Expand Down
15 changes: 12 additions & 3 deletions ms2rescore/feature_generators/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import numpy as np
from psm_utils import PSMList
from psm_utils.io import read_file

from ms2rescore.feature_generators.base import FeatureGeneratorBase

Expand All @@ -40,6 +41,7 @@ def __init__(
*args,
lower_score_is_better: bool = False,
calibration_set_size: Union[int, float, None] = None,
calibration_set: Union[str, None] = None,
processes: int = 1,
**kwargs,
) -> None:
Expand Down Expand Up @@ -71,6 +73,7 @@ def __init__(

self.lower_psm_score_better = lower_score_is_better
self.calibration_set_size = calibration_set_size
self.calibration_set = calibration_set
self.processes = processes
self.deeplc_kwargs = kwargs or {}

Expand Down Expand Up @@ -120,7 +123,9 @@ def add_features(self, psm_list: PSMList) -> None:
# Run DeepLC for each spectrum file
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())

if self.calibration_set:
logger.info("Loading calibration set...")
self.calibration_set = read_file(self.calibration_set, filetype="tsv")
for runs in psm_dict.values():
# Reset DeepLC predictor for each collection of runs
self.deeplc_predictor = None
Expand All @@ -143,8 +148,12 @@ def add_features(self, psm_list: PSMList) -> None:
) if not self._verbose else contextlib.nullcontext():
# Make new PSM list for this run (chain PSMs per spectrum to flat list)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))

psm_list_calibration = self._get_calibration_psms(psm_list_run)
if self.calibration_set:
psm_list_calibration = self.calibration_set[
self.calibration_set["run"] == run
]
else:
psm_list_calibration = self._get_calibration_psms(psm_list_run)
logger.debug(f"Calibrating DeepLC with {len(psm_list_calibration)} PSMs...")
self.deeplc_predictor = self.DeepLC(
n_jobs=self.processes,
Expand Down
259 changes: 259 additions & 0 deletions ms2rescore/feature_generators/ms2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
"""
MS2-based feature generator.

"""

import logging
import re
from typing import List, Optional, Union
from itertools import chain
from collections import defaultdict


import numpy as np
from psm_utils import PSMList
from pyteomics import mass, mzml, mgf
from rustyms import RawSpectrum, LinearPeptide, FragmentationModel, MassMode

from ms2rescore.feature_generators.base import FeatureGeneratorBase
from ms2rescore.utils import infer_spectrum_path
from ms2rescore.exceptions import ParseSpectrumError

logger = logging.getLogger(__name__)

FRAGMENTATION_MODELS = {
"cidhcd": FragmentationModel.CidHcd,
"etcid": FragmentationModel.Etcid,
"etd": FragmentationModel.Etd,
"ethcd": FragmentationModel.Ethcd,
"all": FragmentationModel.All,
}
MASS_MODES = {
"average": MassMode.Average,
"monoisotopic": MassMode.Monoisotopic,
}


class MS2FeatureGenerator(FeatureGeneratorBase):
"""DeepLC retention time-based feature generator."""

def __init__(
self,
*args,
spectrum_path: Optional[str] = None,
spectrum_id_pattern: str = "(.*)",
fragmentation_model: str = "All",
mass_mode: str = "Monoisotopic",
processes: int = 1,
**kwargs,
) -> None:
"""
Generate MS2-based features for rescoring.

Parameters
----------

Attributes
----------
feature_names: list[str]
Names of the features that will be added to the PSMs.

"""
super().__init__(*args, **kwargs)

self.spectrum_path = spectrum_path
self.spectrum_id_pattern = spectrum_id_pattern
self.fragmentation_model = FRAGMENTATION_MODELS[fragmentation_model.lower()]
self.mass_mode = MASS_MODES[mass_mode.lower()]
self.processes = processes

@property
def feature_names(self) -> List[str]:
return [
"ln_explained_intensity",
"ln_total_intensity",
"ln_explained_intensity_ratio",
"ln_explained_b_ion_ratio",
"ln_explained_y_ion_ratio",
"longest_b_ion_sequence",
"longest_y_ion_sequence",
"matched_b_ions",
"matched_b_ions_pct",
"matched_y_ions",
"matched_y_ions_pct",
"matched_ions_pct",
]

def add_features(self, psm_list: PSMList) -> None:
"""Add MS2-derived features to PSMs."""

logger.info("Adding MS2-derived features to PSMs.")
psm_dict = psm_list.get_psm_dict()
current_run = 1
total_runs = sum(len(runs) for runs in psm_dict.values())

for runs in psm_dict.values():
for run, psms in runs.items():
logger.info(
f"Running MS2 for PSMs from run ({current_run}/{total_runs}) `{run}`..."
)
psm_list_run = PSMList(psm_list=list(chain.from_iterable(psms.values())))
spectrum_filename = infer_spectrum_path(self.spectrum_path, run)

self._calculate_features(psm_list_run, spectrum_filename)
current_run += 1

def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
"""Retrieve annotated spectra for all psms."""

spectrum_reader = self._read_spectrum_file(spectrum_file)

spectrum_id_pattern = re.compile(
self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)"
)

try:
mapper = {
spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys()
}
except AttributeError:
raise ParseSpectrumError(
"Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
)
annotated_spectra = [
self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id]))
for psm in psm_list
]
for psm, annotated_spectrum in zip(psm_list, annotated_spectra):
psm.rescoring_features.update(
self._calculate_spectrum_features(psm, annotated_spectrum)
)
# with multiprocessing.Pool(self.processes) as pool:
# counts_failed = 0
# for psm, features in zip(
# psm_list,
# track(
# pool.imap(
# self._calculate_spectrum_features_wrapper,
# zip(psm_list, annotated_spectra),
# chunksize=1000,
# ),
# total=len(psm_list),
# description="Calculating MS2 features...",
# transient=True,
# ),
# ):
# if features:
# psm.rescoring_features.update(features)

# else:
# counts_failed += 1
# if counts_failed > 0:
# logger.warning(f"Failed to calculate features for {counts_failed} PSMs")

@staticmethod
def _read_spectrum_file(spectrum_filepath: str) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:

if spectrum_filepath.suffix.lower() == ".mzml":
return mzml.PreIndexedMzML(str(spectrum_filepath))
elif spectrum_filepath.suffix.lower() == ".mgf":
return mgf.IndexedMGF(str(spectrum_filepath))

@staticmethod
def _longest_ion_sequence(lst):
max_sequence = 0
current_sequence = 0

for value in lst:
current_sequence = current_sequence + 1 if value else 0
max_sequence = max(max_sequence, current_sequence)

return max_sequence

def _calculate_spectrum_features(self, psm, annotated_spectrum):

if not annotated_spectrum:
return {}

features = defaultdict(list)
b_ions_matched = [False] * (len(psm.peptidoform.sequence))
y_ions_matched = [False] * (len(psm.peptidoform.sequence))

pseudo_count = 0.00001
ion_fragment_regex = re.compile(r"\d+")

for peak in annotated_spectrum:
features["total_intensity"].append(peak.intensity)

if peak.annotation:
features["matched_intensity"].append(peak.intensity)
for matched_ion in peak.annotation:
if "y" in matched_ion.ion:
features["y_ion_matched"].append(peak.intensity)
y_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = (
True
)
elif "b" in matched_ion.ion:
features["b_ion_matched"].append(peak.intensity)
b_ions_matched[int(ion_fragment_regex.search(matched_ion.ion).group())] = (
True
)

total_intensity_sum = np.sum(features["total_intensity"])
matched_intensity_sum = np.sum(features["matched_intensity"])
b_ion_matched_sum = np.sum(features["b_ion_matched"])
y_ion_matched_sum = np.sum(features["y_ion_matched"])

return {
"ln_explained_intensity": np.log(matched_intensity_sum + pseudo_count),
"ln_total_intensity": np.log(total_intensity_sum + pseudo_count),
"ln_explained_intensity_ratio": np.log(
matched_intensity_sum / total_intensity_sum + pseudo_count
),
"ln_explained_b_ion_ratio": np.log(
b_ion_matched_sum / matched_intensity_sum + pseudo_count
),
"ln_explained_y_ion_ratio": np.log(
y_ion_matched_sum / matched_intensity_sum + pseudo_count
),
"longest_b_ion_sequence": self._longest_ion_sequence(b_ions_matched),
"longest_y_ion_sequence": self._longest_ion_sequence(y_ions_matched),
"matched_b_ions": sum(b_ions_matched),
"matched_b_ions_pct": sum(b_ions_matched) / len(b_ions_matched),
"matched_y_ions": sum(y_ions_matched),
"matched_y_ions_pct": sum(y_ions_matched) / len(y_ions_matched),
"matched_ions_pct": (sum(b_ions_matched) + sum(y_ions_matched))
/ (len(b_ions_matched) + len(y_ions_matched)),
}

def _calculate_spectrum_features_wrapper(self, psm_spectrum_tuple):
psm, spectrum = psm_spectrum_tuple
return self._calculate_spectrum_features(psm, spectrum)

def _annotate_spectrum(self, psm, pyteomics_spectrum):

spectrum = RawSpectrum(
title=psm.spectrum_id,
num_scans=1,
rt=psm.retention_time,
precursor_charge=psm.get_precursor_charge(),
precursor_mass=mass.calculate_mass(psm.peptidoform.composition),
mz_array=pyteomics_spectrum["m/z array"],
intensity_array=pyteomics_spectrum["intensity array"],
)
try:
annotated_spectrum = spectrum.annotate(
peptide=LinearPeptide(psm.peptidoform.proforma.split("/")[0]),
model=self.fragmentation_model,
mode=self.mass_mode,
)
except: # noqa E722
return []

return annotated_spectrum.spectrum


# TODO: keep this here?
def modification_evidence():
return
1 change: 1 addition & 0 deletions ms2rescore/package_data/config_default.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
},
"maxquant": {}
},
"psm_generator": {},
"rescoring_engine": {
"mokapot": {
"train_fdr": 0.01,
Expand Down
Loading
Loading