Process Proteomics Data from Sage with msmu¶

In [1]:

Copied!





import msmu as mm
from pathlib import Path
import pandas as pd
import requests
import tarfile
import msmu as mm
from pathlib import Path
import pandas as pd
import requests
import tarfile

Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail

In [2]:

Copied!





url = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/MSV000089280.tar.gz"
meta = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/meta.csv"
base_dir = Path(url).name.split(".")[0]
sage_idents = f"{base_dir}/results.sage.tsv"
sage_quants = f"{base_dir}/lfq.tsv"

r = requests.get(url)
r.raise_for_status()

with open(Path(url).name, "wb") as f:
    f.write(r.content)

with tarfile.open(Path(url).name, "r:gz") as tar:
    members = [m for m in tar.getmembers() if not Path(m.name).name.startswith("._")]
    tar.extractall(members=members)
url = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/MSV000089280.tar.gz"
meta = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/meta.csv"
base_dir = Path(url).name.split(".")[0]
sage_idents = f"{base_dir}/results.sage.tsv"
sage_quants = f"{base_dir}/lfq.tsv"

r = requests.get(url)
r.raise_for_status()

with open(Path(url).name, "wb") as f:
    f.write(r.content)

with tarfile.open(Path(url).name, "r:gz") as tar:
    members = [m for m in tar.getmembers() if not Path(m.name).name.startswith("._")]
    tar.extractall(members=members)

/var/folders/pp/7ts5fh4x5hl81rnn895l34ph0000gn/T/ipykernel_17793/2858270411.py:15: DeprecationWarning: Python 3.14 will, by default, filter extracted tar archives and reject files or modify their metadata. Use the filter argument to control this behavior.
  tar.extractall(members=members)

In [3]:

Copied!

mdata = mm.read_sage(identification_file=sage_idents, quantification_file=sage_quants, label="label_free")
mdata = mm.read_sage(identification_file=sage_idents, quantification_file=sage_quants, label="label_free")

INFO - Identification file loaded: (722655, 40)
INFO - Quantification file loaded: (19530, 112)
INFO - Decoy entries separated: (217399, 15)

In [4]:

Copied!

meta_df = pd.read_csv(meta)
meta_df = meta_df.set_index("sample_id")  # set the index to match sample id in mdata.obs

mdata.obs = mdata.obs.join(meta_df)
mdata.push_obs()  # update all modalities with the new obs data
meta_df = pd.read_csv(meta)
meta_df = meta_df.set_index("sample_id")  # set the index to match sample id in mdata.obs

mdata.obs = mdata.obs.join(meta_df)
mdata.push_obs()  # update all modalities with the new obs data

PSM¶

In [5]:

Copied!

mdata = mm.pp.add_filter(mdata, modality="psm", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.add_filter(mdata, modality="psm", column="proteins", keep="not_contains", value="contam_")
mdata = mm.pp.apply_filter(mdata, modality="psm")

mdata
mdata = mm.pp.add_filter(mdata, modality="psm", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.add_filter(mdata, modality="psm", column="proteins", keep="not_contains", value="contam_")
mdata = mm.pp.apply_filter(mdata, modality="psm")

mdata

Out[5]:

MuData object with n_obs × n_vars = 106 × 267797
  obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
  uns:	'_cmd'
  2 modalities
    psm:	106 x 248267
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass'
      uns:	'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
      varm:	'search_result', 'filter'
    peptide:	106 x 19530
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      uns:	'level'

Peptide¶

In [6]:

Copied!

mdata = mm.pp.to_peptide(mdata)
mdata
mdata = mm.pp.to_peptide(mdata)
mdata

INFO - Peptide-level identifications: 25260 (19769 at 1% FDR)

Using existing peptide quantification data.

Out[6]:

MuData object with n_obs × n_vars = 106 × 248267
  obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
  uns:	'_cmd'
  2 modalities
    psm:	106 x 248267
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass'
      uns:	'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
      varm:	'search_result', 'filter'
    peptide:	106 x 25260
      var:	'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value'
      uns:	'level', 'decoy'

In [7]:

Copied!





mdata["psm"].var["cell"] = mdata["psm"].var["filename"].map(mdata.obs["cell"])
mdata["psm"].uns["decoy"]["cell"] = mdata["psm"].uns["decoy"]["filename"].map(mdata.obs["cell"])

mdata = mm.pp.add_filter(mdata, modality="psm", column="cell", keep="contains", value="C10|SVEC")
mdata = mm.pp.apply_filter(mdata, modality="psm")

mdata = mdata[mdata.obs["cell"].isin(["C10", "SVEC"])].copy()
mdata
mdata["psm"].var["cell"] = mdata["psm"].var["filename"].map(mdata.obs["cell"])
mdata["psm"].uns["decoy"]["cell"] = mdata["psm"].uns["decoy"]["filename"].map(mdata.obs["cell"])

mdata = mm.pp.add_filter(mdata, modality="psm", column="cell", keep="contains", value="C10|SVEC")
mdata = mm.pp.apply_filter(mdata, modality="psm")

mdata = mdata[mdata.obs["cell"].isin(["C10", "SVEC"])].copy()
mdata

Out[7]:

MuData object with n_obs × n_vars = 70 × 99285
  obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
  uns:	'_cmd'
  2 modalities
    psm:	70 x 74025
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass', 'cell'
      uns:	'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
      varm:	'search_result', 'filter'
    peptide:	70 x 25260
      var:	'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value'
      uns:	'level', 'decoy'

Filtering - peptide¶

In [8]:

Copied!





mdata = mm.pp.add_filter(mdata, modality="peptide", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.apply_filter(mdata, modality="peptide")
mdata.mod["peptide"] = mdata["peptide"][:, mdata["peptide"].to_df().dropna(axis=1, how="all").columns]
mdata.update()

mdata
mdata = mm.pp.add_filter(mdata, modality="peptide", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.apply_filter(mdata, modality="peptide")
mdata.mod["peptide"] = mdata["peptide"][:, mdata["peptide"].to_df().dropna(axis=1, how="all").columns]
mdata.update()

mdata

Out[8]:

MuData object with n_obs × n_vars = 70 × 92871
  obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
  uns:	'_cmd'
  2 modalities
    psm:	70 x 74025
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass', 'cell'
      uns:	'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
      varm:	'search_result', 'filter'
    peptide:	70 x 18846
      var:	'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value'
      uns:	'level', 'decoy', 'filter', 'decoy_filter'
      varm:	'filter'

Normalization¶

Here, we log2 transform and normalize the data at the peptide level.

Median centering normalization is applied using mm.pp.normalize() function.

In [ ]:

Copied!

mdata["psm"].layers["raw"] = mdata["psm"].X.copy()

mdata = mm.pp.log2_transform(mdata, modality="peptide")
mdata = mm.pp.normalize(mdata, modality="peptide", method="median")
mdata["psm"].layers["raw"] = mdata["psm"].X.copy()

mdata = mm.pp.log2_transform(mdata, modality="peptide")
mdata = mm.pp.normalize(mdata, modality="peptide", method="median")

/Users/jl/Scripts/msmu/msmu/_preprocessing/_normalise.py:29: ImplicitModificationWarning: Modifying `X` on a view results in data being overridden
  mdata[modality].X = log2_arr
/Users/jl/Scripts/msmu/msmu/_preprocessing/_normalise.py:123: ImplicitModificationWarning: Modifying `X` on a view results in data being overridden
  mdata.mod[modality].X = normalised_arr

Protein inference¶

You can infer protein-level data from peptide-level data using the mm.pp.infer_protein() function.

In [10]:

Copied!

mdata = mm.pp.infer_protein(mdata)
mdata = mm.pp.infer_protein(mdata)

INFO - Starting protein inference
INFO - Initial proteins: 4268
INFO - Removed indistinguishable: 197
INFO - Removed subsettable: 263
INFO - Removed subsumable: 19
INFO - Total protein groups: 3789

Protein¶

In [11]:

Copied!

mdata = mm.pp.to_protein(mdata, top_n=3, rank_method="total_intensity")
mdata = mm.pp.to_protein(mdata, top_n=3, rank_method="total_intensity")

INFO - Ranking features by 'total_intensity' to select top 3 features.
INFO - Protein-level identifications :  3595 (3054 at 1% FDR)

Filtering - protein¶

In [12]:

Copied!





mdata = mm.pp.add_filter(mdata, modality="protein", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.apply_filter(mdata, modality="protein")
mdata.mod["protein"] = mdata["protein"][:, mdata["protein"].to_df().dropna(axis=1, how="all").columns]
mdata.update()

mdata
mdata = mm.pp.add_filter(mdata, modality="protein", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.apply_filter(mdata, modality="protein")
mdata.mod["protein"] = mdata["protein"][:, mdata["protein"].to_df().dropna(axis=1, how="all").columns]
mdata.update()

mdata

Out[12]:

MuData object with n_obs × n_vars = 70 × 95925
  obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
  uns:	'_cmd'
  3 modalities
    psm:	70 x 74025
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass', 'cell'
      uns:	'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
      varm:	'search_result', 'filter'
      layers:	'raw'
    peptide:	70 x 18846
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value', 'protein_group', 'peptide_type'
      uns:	'level', 'decoy', 'filter', 'decoy_filter'
      varm:	'filter'
    protein:	70 x 3054
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'count_psm', 'count_stripped_peptide', 'PEP', 'q_value'
      uns:	'level', 'decoy', 'filter', 'decoy_filter'
      varm:	'filter'

Imputation¶

In [13]:

Copied!





import matplotlib.pyplot as plt
import pandas as pd
import pimmslearn.plotting as pmp
import pimmslearn.sampling as pms
import pimmslearn.models as pmm

from pimmslearn.plotting.defaults import color_model_mapping
from pimmslearn.sklearn.ae_transformer import AETransformer
from pimmslearn.sklearn.cf_transformer import CollaborativeFilteringTransformer


pmp.make_large_descriptors("8")

index_name: str = "Sample ID"
column_name: str = "protein group"
frac_non_train: float = 0.1
frac_mnar: float = 0.05
random_state: int = 42
import matplotlib.pyplot as plt
import pandas as pd
import pimmslearn.plotting as pmp
import pimmslearn.sampling as pms
import pimmslearn.models as pmm

from pimmslearn.plotting.defaults import color_model_mapping
from pimmslearn.sklearn.ae_transformer import AETransformer
from pimmslearn.sklearn.cf_transformer import CollaborativeFilteringTransformer


pmp.make_large_descriptors("8")

index_name: str = "Sample ID"
column_name: str = "protein group"
frac_non_train: float = 0.1
frac_mnar: float = 0.05
random_state: int = 42

In [14]:

Copied!

df = mdata["protein"].to_df()
df.index.name = "Sample ID"
df.columns.name = "protein group"
df = mdata["protein"].to_df()
df.index.name = "Sample ID"
df.columns.name = "protein group"

Check missing value pattern¶

In [15]:

Copied!

ax = pmp.data.plot_feat_median_over_prop_missing(data=df, type="boxplot")
ax = pmp.data.plot_feat_median_over_prop_missing(data=df, type="boxplot")

/Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/pimmslearn/plotting/data.py:327: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  ax = ax[0]  # returned series due to by argument?

No description has been provided for this image

Robust missing value imputation¶

In [16]:

Copied!





df = df.stack().to_frame("intensity")

splits, thresholds, fake_na_mcar, fake_na_mnar = pms.sample_mnar_mcar(
    df_long=df,
    frac_non_train=frac_non_train,
    frac_mnar=frac_mnar,
    random_state=random_state,
)
splits = pms.check_split_integrity(splits)
df = df.stack().to_frame("intensity")

splits, thresholds, fake_na_mcar, fake_na_mnar = pms.sample_mnar_mcar(
    df_long=df,
    frac_non_train=frac_non_train,
    frac_mnar=frac_mnar,
    random_state=random_state,
)
splits = pms.check_split_integrity(splits)

/Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/pimmslearn/sampling.py:209: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead
  loc=float(quantile_frac),
/Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/pimmslearn/sampling.py:210: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead
  scale=float(0.3 * df_long.std()),

In [17]:

Copied!





cf_model = CollaborativeFilteringTransformer(
    target_column="intensity",
    sample_column="Sample ID",
    item_column="protein group",
    out_folder="runs/scikit_interface",
)

cf_model.fit(splits.train_X, splits.val_y, cuda=False, epochs_max=20)
cf_model = CollaborativeFilteringTransformer(
    target_column="intensity",
    sample_column="Sample ID",
    item_column="protein group",
    out_folder="runs/scikit_interface",
)

cf_model.fit(splits.train_X, splits.val_y, cuda=False, epochs_max=20)

suggested_lr.valley = 0.00525

epoch	train_loss	valid_loss	time
0	6.320165	6.174103	00:00
1	5.522466	4.395256	00:00
2	3.587925	2.049297	00:00
3	2.431448	1.833395	00:00
4	1.831739	1.518024	00:00
5	1.435816	1.355048	00:00
6	1.189103	1.294942	00:00
7	1.034226	1.249249	00:00
8	0.919461	1.232482	00:00
9	0.839021	1.222002	00:00
10	0.773871	1.213310	00:00
11	0.722318	1.214855	00:00

No improvement since epoch 10: early stopping

Out[17]:

CollaborativeFilteringTransformer(item_column='protein group',
                                  out_folder=Path('runs/scikit_interface'),
                                  sample_column='Sample ID',
                                  target_column='intensity')

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [18]:

Copied!

df_imputed = cf_model.transform(df).unstack()
assert df_imputed.isna().sum().sum() == 0
df_imputed = cf_model.transform(df).unstack()
assert df_imputed.isna().sum().sum() == 0

Check imputation results¶

In [19]:

Copied!





df_imputed = df_imputed.stack()  # long-format
observed = df_imputed.loc[df.index]
imputed = df_imputed.loc[df_imputed.index.difference(df.index)]
df_imputed = df_imputed.unstack()  # back to wide-format
# some checks
assert len(df) == len(observed)
assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)

fig, axes = plt.subplots(2, figsize=(8, 4))

min_max = pmp.data.get_min_max_iterable([observed, imputed])
label_template = "{method} (N={n:,d})"
ax, _ = pmp.data.plot_histogram_intensities(
    observed,
    ax=axes[0],
    min_max=min_max,
    label=label_template.format(
        method="measured",
        n=len(observed),
    ),
    color="grey",
    alpha=1,
)
_ = ax.legend()
ax, _ = pmp.data.plot_histogram_intensities(
    imputed,
    ax=axes[1],
    min_max=min_max,
    label=label_template.format(
        method="CF imputed",
        n=len(imputed),
    ),
    color=color_model_mapping["CF"],
    alpha=1,
)
_ = ax.legend()
df_imputed = df_imputed.stack()  # long-format
observed = df_imputed.loc[df.index]
imputed = df_imputed.loc[df_imputed.index.difference(df.index)]
df_imputed = df_imputed.unstack()  # back to wide-format
# some checks
assert len(df) == len(observed)
assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)

fig, axes = plt.subplots(2, figsize=(8, 4))

min_max = pmp.data.get_min_max_iterable([observed, imputed])
label_template = "{method} (N={n:,d})"
ax, _ = pmp.data.plot_histogram_intensities(
    observed,
    ax=axes[0],
    min_max=min_max,
    label=label_template.format(
        method="measured",
        n=len(observed),
    ),
    color="grey",
    alpha=1,
)
_ = ax.legend()
ax, _ = pmp.data.plot_histogram_intensities(
    imputed,
    ax=axes[1],
    min_max=min_max,
    label=label_template.format(
        method="CF imputed",
        n=len(imputed),
    ),
    color=color_model_mapping["CF"],
    alpha=1,
)
_ = ax.legend()

In [20]:

Copied!

splits.to_wide_format()
splits.val_y = pd.DataFrame(pd.NA, index=splits.train_X.index, columns=splits.train_X.columns).fillna(splits.val_y)
splits.to_wide_format()
splits.val_y = pd.DataFrame(pd.NA, index=splits.train_X.index, columns=splits.train_X.columns).fillna(splits.val_y)

Imputation with denoising autoencoder¶

In [21]:

Copied!





model_selected = "DAE"

model = AETransformer(
    model=model_selected,
    hidden_layers=[512],
    latent_dim=50,
    out_folder="runs/scikit_interface",
    batch_size=10,
)
model_selected = "DAE"

model = AETransformer(
    model=model_selected,
    hidden_layers=[512],
    latent_dim=50,
    out_folder="runs/scikit_interface",
    batch_size=10,
)

In [22]:

Copied!

model.fit(splits.train_X, splits.val_y, epochs_max=50, cuda=False)
model.fit(splits.train_X, splits.val_y, epochs_max=50, cuda=False)

epoch	train_loss	valid_loss	time
0	31082.289062	1663.821411	00:00
1	29537.611328	1580.292236	00:00
2	27614.193359	1379.376953	00:00
3	25691.705078	1447.400391	00:00
4	24164.673828	1423.883423	00:00
5	22979.968750	1181.390015	00:00
6	21837.660156	1150.241577	00:00
7	20585.669922	1149.429443	00:00
8	19582.843750	1130.486206	00:00
9	18656.513672	1136.786377	00:00
10	18028.900391	1183.855469	00:00
11	17458.158203	1220.037476	00:00
12	16862.056641	1204.772095	00:00
13	16387.130859	1181.086670	00:00
14	15914.469727	1175.465820	00:00
15	15412.682617	1172.427368	00:00
16	14872.686523	1138.039551	00:00
17	14321.765625	1118.993286	00:00
18	13806.336914	1138.566040	00:00
19	13332.366211	1119.463257	00:00
20	12888.664062	1118.378662	00:00
21	12363.695312	1109.387085	00:00
22	11877.583008	1104.461548	00:00
23	11523.218750	1100.300537	00:00
24	11178.151367	1109.922119	00:00
25	10833.538086	1103.574463	00:00
26	10543.821289	1106.167847	00:00
27	10258.711914	1107.717651	00:00
28	9946.045898	1100.307251	00:00
29	9641.920898	1093.588257	00:00
30	9378.814453	1095.416016	00:00
31	9125.350586	1090.117310	00:00
32	8903.874023	1085.963135	00:00
33	8626.411133	1087.794922	00:00
34	8382.924805	1089.914551	00:00
35	8222.190430	1089.441284	00:00
36	7962.709961	1080.896240	00:00
37	7748.377930	1073.487915	00:00
38	7572.700195	1073.060303	00:00
39	7424.407715	1074.218140	00:00
40	7307.616211	1078.132324	00:00
41	7132.653809	1077.919922	00:00
42	7049.728516	1077.697388	00:00
43	6878.440918	1074.800903	00:00
44	6760.032715	1074.024658	00:00
45	6639.622559	1071.721191	00:00
46	6609.085938	1075.444824	00:00
47	6488.123047	1074.282593	00:00
48	6376.672363	1073.937500	00:00
49	6316.729492	1074.901123	00:00

Out[22]:

AETransformer(batch_size=10, hidden_layers=[512], latent_dim=50,
              model=<class 'pimmslearn.models.ae.Autoencoder'>,
              out_folder=Path('runs/scikit_interface'))

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [23]:

Copied!

df_imputed = model.transform(splits.train_X).stack()
df_imputed = model.transform(splits.train_X).stack()

In [24]:

Copied!





pred_val = splits.val_y.stack().to_frame("observed")
pred_val[model_selected] = df_imputed
val_metrics = pmm.calculte_metrics(pred_val, "observed")

fig, ax = plt.subplots(figsize=(8, 2))

ax, errors_binned = pmp.errors.plot_errors_by_median(
    pred=pred_val,
    target_col="observed",
    feat_medians=splits.train_X.median(),
    ax=ax,
    metric_name="MAE",
    palette=color_model_mapping,
)
pred_val = splits.val_y.stack().to_frame("observed")
pred_val[model_selected] = df_imputed
val_metrics = pmm.calculte_metrics(pred_val, "observed")

fig, ax = plt.subplots(figsize=(8, 2))

ax, errors_binned = pmp.errors.plot_errors_by_median(
    pred=pred_val,
    target_col="observed",
    feat_medians=splits.train_X.median(),
    ax=ax,
    metric_name="MAE",
    palette=color_model_mapping,
)

/Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_vals = vals.groupby(grouper)
/Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped_vals = vals.groupby(grouper)

In [25]:

Copied!

splits.to_long_format()
df_imputed = df_imputed.replace(splits.val_y).replace(splits.test_y)
splits.to_long_format()
df_imputed = df_imputed.replace(splits.val_y).replace(splits.test_y)

Check imputation results¶

In [26]:

Copied!





observed = df_imputed.loc[df.index].squeeze()
imputed = df_imputed.loc[df_imputed.index.difference(df.index)].squeeze()

fig, axes = plt.subplots(2, figsize=(8, 4))

min_max = pmp.data.get_min_max_iterable([observed, imputed])

label_template = "{method} (N={n:,d})"
ax, _ = pmp.data.plot_histogram_intensities(
    observed,
    ax=axes[0],
    min_max=min_max,
    label=label_template.format(
        method="measured",
        n=len(observed),
    ),
    color="grey",
    alpha=1,
)
_ = ax.legend()
ax, _ = pmp.data.plot_histogram_intensities(
    imputed,
    ax=axes[1],
    min_max=min_max,
    label=label_template.format(
        method=f"{model_selected} imputed",
        n=len(imputed),
    ),
    color=color_model_mapping[model_selected],
    alpha=1,
)
_ = ax.legend()
observed = df_imputed.loc[df.index].squeeze()
imputed = df_imputed.loc[df_imputed.index.difference(df.index)].squeeze()

fig, axes = plt.subplots(2, figsize=(8, 4))

min_max = pmp.data.get_min_max_iterable([observed, imputed])

label_template = "{method} (N={n:,d})"
ax, _ = pmp.data.plot_histogram_intensities(
    observed,
    ax=axes[0],
    min_max=min_max,
    label=label_template.format(
        method="measured",
        n=len(observed),
    ),
    color="grey",
    alpha=1,
)
_ = ax.legend()
ax, _ = pmp.data.plot_histogram_intensities(
    imputed,
    ax=axes[1],
    min_max=min_max,
    label=label_template.format(
        method=f"{model_selected} imputed",
        n=len(imputed),
    ),
    color=color_model_mapping[model_selected],
    alpha=1,
)
_ = ax.legend()

Push imputed data to mudata object¶

In [27]:

Copied!

mdata.mod["protein"].layers["imputed"] = df_imputed.unstack()
mdata.update()

mdata
mdata.mod["protein"].layers["imputed"] = df_imputed.unstack()
mdata.update()

mdata

/var/folders/pp/7ts5fh4x5hl81rnn895l34ph0000gn/T/ipykernel_17793/3088553350.py:1: ImplicitModificationWarning: Setting element `.layers['imputed']` of view, initializing view as actual.
  mdata.mod["protein"].layers["imputed"] = df_imputed.unstack()

Out[27]:

MuData object with n_obs × n_vars = 70 × 95925
  obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
  uns:	'_cmd'
  3 modalities
    psm:	70 x 74025
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass', 'cell'
      uns:	'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
      varm:	'search_result', 'filter'
      layers:	'raw'
    peptide:	70 x 18846
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value', 'protein_group', 'peptide_type'
      uns:	'level', 'decoy', 'filter', 'decoy_filter'
      varm:	'filter'
    protein:	70 x 3054
      obs:	'set', 'sample', 'cell', 'condition', 'sample_rna'
      var:	'count_psm', 'count_stripped_peptide', 'PEP', 'q_value'
      uns:	'level', 'decoy', 'filter', 'decoy_filter'
      varm:	'filter'
      layers:	'imputed'

Save MuData object¶

In [28]:

Copied!

mdata.write_h5mu("MSV000089280.h5mu")
mdata.write_h5mu("MSV000089280.h5mu")