import msmu as mm
import pandas as pd
base_dir = "https://raw.githubusercontent.com/bertis-informatics/msmu/refs/heads/main/data/sage_tmt"
sage_idents = f"{base_dir}/sage/results.sage.tsv"
sage_quants = f"{base_dir}/sage/tmt.tsv"
meta = f"{base_dir}/meta.csv"
ptm_mdata = mm.read_sage(identification_file=sage_idents, quantification_file=sage_quants, label="tmt")
meta_df = pd.read_csv("https://raw.githubusercontent.com/bertis-informatics/msmu/refs/heads/main/data/sage_tmt/meta.csv")
meta_df = meta_df.set_index("tag") # set the index to match sample id in ptm_mdata.obs
ptm_mdata.obs = ptm_mdata.obs.join(meta_df)
ptm_mdata.push_obs() # update all modalities with the new obs data
ptm_mdata = mm.pp.add_filter(ptm_mdata, modality="psm", column="q_value", keep="lt", value=0.01)
ptm_mdata = mm.pp.add_filter(ptm_mdata, modality="psm", column="proteins", keep="not_contains", value="contam_")
ptm_mdata = mm.pp.apply_filter(ptm_mdata, modality="psm")
ptm_mdata = mm.pp.to_peptide(ptm_mdata)
ptm_mdata = mm.pp.log2_transform(ptm_mdata, modality="peptide")
ptm_mdata = mm.pp.normalise(ptm_mdata, modality="peptide", method="median")
ptm_mdata = mm.pp.add_filter(ptm_mdata, modality="peptide", column="q_value", keep="lt", value=0.01)
ptm_mdata = mm.pp.apply_filter(ptm_mdata, modality="peptide")
ptm_mdata
INFO - Identification file loaded: (5000, 40) INFO - Quantification file loaded: (5000, 9) INFO - Decoy entries separated: (1195, 13) WARNING - Purity column not found in psm modality for TMT data. Skipping purity filtering. INFO - Peptide-level identifications: 2204 (2151 at 1% FDR)
Building new peptide quantification data.
MuData object with n_obs × n_vars = 6 × 4415
obs: 'set', 'sample_id', 'sample_name', 'condition'
uns: '_cmd'
2 modalities
psm: 6 x 2264
obs: 'set', 'sample_id', 'sample_name', 'condition'
var: 'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value'
uns: 'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
varm: 'search_result', 'filter'
peptide: 6 x 2151
obs: 'set', 'sample_id', 'sample_name', 'condition'
var: 'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value'
uns: 'level', 'decoy', 'filter', 'decoy_filter'
varm: 'filter'
Global Protein Data Processing¶
For real data analysis, global protein data should be processed with other batch experiment for global protein quantification. Here, we will use the same SAGE TMT data for demonstration purpose.
global_mdata = ptm_mdata.copy()
global_mdata = mm.pp.infer_protein(global_mdata)
global_mdata = mm.pp.to_protein(global_mdata)
global_mdata = mm.pp.add_filter(global_mdata, modality="protein", column="q_value", keep="lt", value=0.01)
global_mdata = mm.pp.apply_filter(global_mdata, modality="protein")
INFO - Starting protein inference INFO - Initial proteins: 1722 INFO - Removed indistinguishable: 175 INFO - Removed subsettable: 62 INFO - Removed subsumable: 0 INFO - Total protein groups: 1485 INFO - Ranking features by 'total_intensity' to select top 3 features. INFO - Protein-level identifications : 1465 (1432 at 1% FDR)
PTM Data Processing¶
Protein Inference with global protein data¶
PTM data uses the same peptide-protein mapping as global protein data, therefore we can directly use the peptide to protein mapping from global protein data processing step by using mm.pp.infer_protein function with progagated_from parameter.
ptm_mdata = mm.pp.infer_protein(ptm_mdata, propagated_from=global_mdata)
INFO - Starting protein inference
PTM Summarization¶
For PTM summarization, we need FASTA file that contains the protein sequences used for ptm site localization.
Then, PTM site is summarized by mm.pp.to_ptm with modi_name for specifying modality name (e.g. "oxidation" -> "oxidation_site") for PTM data and modification for specifying the modification string used for site localization in modified peptides.
import urllib.request
fasta_url="https://raw.githubusercontent.com/bertis-informatics/msmu/refs/heads/main/data/fasta/human_fasta.fasta"
fasta_file = "./human_fasta.fasta"
with urllib.request.urlopen(fasta_url) as response:
with open(fasta_file, "wb") as f:
fasta_ = response.read()
f.write(fasta_)
ptm_mdata = mm.utils.attach_fasta(ptm_mdata, fasta_file=fasta_file)
ptm_mdata = mm.pp.to_ptm(ptm_mdata, modi_name="oxidation", modification="[+15.9949]")
ptm_mdata
INFO - Extracted modified peptides: 329 / 2151 INFO - oxidation site level identifications: 351 INFO - Building new oxidation_site AnnData.
MuData object with n_obs × n_vars = 6 × 4766
obs: 'set', 'sample_id', 'sample_name', 'condition'
uns: '_cmd', 'protein_map', 'protein_info'
3 modalities
psm: 6 x 2264
obs: 'set', 'sample_id', 'sample_name', 'condition'
var: 'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value'
uns: 'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
varm: 'search_result', 'filter'
peptide: 6 x 2151
obs: 'set', 'sample_id', 'sample_name', 'condition'
var: 'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value', 'protein_group', 'peptide_type'
uns: 'level', 'decoy', 'filter', 'decoy_filter'
varm: 'filter'
oxidation_site: 6 x 351
obs: 'set', 'sample_id', 'sample_name', 'condition'
var: 'count_psm', 'peptide', 'count_peptide', 'count_stripped_peptide', 'modified_protein', 'protein_group'
uns: 'level'
ptm_mdata["oxidation_site"].to_df().T
| 126 | 127 | 128 | 129 | 130 | 131 | |
|---|---|---|---|---|---|---|
| A0A3B3IRV3|M150,Q9ULC4|M150 | 16.280768 | 15.531237 | 15.376702 | 15.057094 | 15.299921 | 15.772261 |
| O00410|M61 | 14.642673 | 14.665745 | 14.845570 | 14.437368 | 14.755740 | 15.703198 |
| O00422|M120 | 14.847602 | 15.049636 | 14.597786 | 14.460380 | 14.671438 | 14.589479 |
| O14497|M1273 | 13.214325 | 13.392085 | 13.262411 | 13.074445 | 12.827528 | 14.306335 |
| O14950|M40,P19105|M39 | 15.821183 | 15.779716 | 15.662250 | 15.596942 | 15.563856 | 15.759404 |
| ... | ... | ... | ... | ... | ... | ... |
| Q9Y490|M289 | 17.150820 | 17.170126 | 16.605962 | 16.176437 | 15.987995 | 16.259189 |
| Q9Y490|M72 | 14.399083 | 14.174609 | 13.842843 | 13.546905 | 13.453805 | 13.618298 |
| Q9Y5B9|M468 | 14.913293 | 15.107410 | 15.251177 | 14.708296 | 14.658334 | 15.900108 |
| Q9Y639|M379 | 14.101203 | 13.768589 | 13.309624 | 13.224675 | 13.218592 | 13.609742 |
| Q9Y6G3|M108 | 10.560294 | 12.021548 | 10.495958 | 12.244229 | 10.887468 | 11.901794 |
351 rows × 6 columns
PTM quantification adjustment by global protein levels¶
To adjust PTM quantification by global protein levels, we can use mm.pp.adjust_ptm_by_protein function. This function normalizes PTM quantification by corresponding protein levels to remove the confounding effect from protein abundance changes.
ptm_mdata = mm.pp.adjust_ptm_by_protein(
ptm_mdata,
modality="oxidation_site",
global_mdata=global_mdata,
method="ridge",
rescale=True,
)
ptm_mdata["oxidation_site"].to_df().T
| 126 | 127 | 128 | 129 | 130 | 131 | |
|---|---|---|---|---|---|---|
| A0A3B3IRV3|M150,Q9ULC4|M150 | 14.913651 | 14.170947 | 14.017820 | 13.701122 | 13.941738 | 14.409776 |
| O00410|M61 | 14.003172 | 14.013189 | 14.190895 | 13.791744 | 14.112336 | 15.043717 |
| O00422|M120 | 14.337066 | 14.538646 | 14.087810 | 13.950713 | 14.161297 | 14.079522 |
| O14497|M1273 | 14.062327 | 14.237820 | 14.109800 | 13.924230 | 13.680461 | 15.140416 |
| O14950|M40,P19105|M39 | 14.316405 | 14.275079 | 14.157595 | 14.092328 | 14.059105 | 14.254542 |
| ... | ... | ... | ... | ... | ... | ... |
| Q9Y490|M289 | 14.777373 | 14.796433 | 14.239445 | 13.815383 | 13.629338 | 13.897082 |
| Q9Y490|M72 | 14.746868 | 14.522217 | 14.195656 | 13.903682 | 13.812320 | 13.974311 |
| Q9Y5B9|M468 | 14.015269 | 14.218388 | 14.357117 | 13.823126 | 13.764951 | 14.976202 |
| Q9Y639|M379 | 14.751465 | 14.420926 | 13.964825 | 13.880406 | 13.874360 | 14.263071 |
| Q9Y6G3|M108 | 13.411507 | 14.857057 | 13.343002 | 15.075578 | 13.730476 | 14.737434 |
334 rows × 6 columns