Process Proteomics Data from Sage with msmu¶
In [1]:
Copied!
import msmu as mm
from pathlib import Path
import pandas as pd
import requests
import tarfile
import msmu as mm
from pathlib import Path
import pandas as pd
import requests
import tarfile
Determination of memory status is not supported on this platform, measuring for memoryleaks will never fail
In [2]:
Copied!
url = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/MSV000089280.tar.gz"
meta = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/meta.csv"
base_dir = Path(url).name.split(".")[0]
sage_idents = f"{base_dir}/results.sage.tsv"
sage_quants = f"{base_dir}/lfq.tsv"
r = requests.get(url)
r.raise_for_status()
with open(Path(url).name, "wb") as f:
f.write(r.content)
with tarfile.open(Path(url).name, "r:gz") as tar:
members = [m for m in tar.getmembers() if not Path(m.name).name.startswith("._")]
tar.extractall(members=members)
url = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/MSV000089280.tar.gz"
meta = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/meta.csv"
base_dir = Path(url).name.split(".")[0]
sage_idents = f"{base_dir}/results.sage.tsv"
sage_quants = f"{base_dir}/lfq.tsv"
r = requests.get(url)
r.raise_for_status()
with open(Path(url).name, "wb") as f:
f.write(r.content)
with tarfile.open(Path(url).name, "r:gz") as tar:
members = [m for m in tar.getmembers() if not Path(m.name).name.startswith("._")]
tar.extractall(members=members)
/var/folders/pp/7ts5fh4x5hl81rnn895l34ph0000gn/T/ipykernel_17793/2858270411.py:15: DeprecationWarning: Python 3.14 will, by default, filter extracted tar archives and reject files or modify their metadata. Use the filter argument to control this behavior. tar.extractall(members=members)
In [3]:
Copied!
mdata = mm.read_sage(identification_file=sage_idents, quantification_file=sage_quants, label="label_free")
mdata = mm.read_sage(identification_file=sage_idents, quantification_file=sage_quants, label="label_free")
INFO - Identification file loaded: (722655, 40) INFO - Quantification file loaded: (19530, 112) INFO - Decoy entries separated: (217399, 15)
In [4]:
Copied!
meta_df = pd.read_csv(meta)
meta_df = meta_df.set_index("sample_id") # set the index to match sample id in mdata.obs
mdata.obs = mdata.obs.join(meta_df)
mdata.push_obs() # update all modalities with the new obs data
meta_df = pd.read_csv(meta)
meta_df = meta_df.set_index("sample_id") # set the index to match sample id in mdata.obs
mdata.obs = mdata.obs.join(meta_df)
mdata.push_obs() # update all modalities with the new obs data
PSM¶
In [5]:
Copied!
mdata = mm.pp.add_filter(mdata, modality="psm", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.add_filter(mdata, modality="psm", column="proteins", keep="not_contains", value="contam_")
mdata = mm.pp.apply_filter(mdata, modality="psm")
mdata
mdata = mm.pp.add_filter(mdata, modality="psm", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.add_filter(mdata, modality="psm", column="proteins", keep="not_contains", value="contam_")
mdata = mm.pp.apply_filter(mdata, modality="psm")
mdata
Out[5]:
MuData object with n_obs × n_vars = 106 × 267797
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
uns: '_cmd'
2 modalities
psm: 106 x 248267
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass'
uns: 'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
varm: 'search_result', 'filter'
peptide: 106 x 19530
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
uns: 'level'
Peptide¶
In [6]:
Copied!
mdata = mm.pp.to_peptide(mdata)
mdata
mdata = mm.pp.to_peptide(mdata)
mdata
INFO - Peptide-level identifications: 25260 (19769 at 1% FDR)
Using existing peptide quantification data.
Out[6]:
MuData object with n_obs × n_vars = 106 × 248267
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
uns: '_cmd'
2 modalities
psm: 106 x 248267
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass'
uns: 'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
varm: 'search_result', 'filter'
peptide: 106 x 25260
var: 'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value'
uns: 'level', 'decoy'
In [7]:
Copied!
mdata["psm"].var["cell"] = mdata["psm"].var["filename"].map(mdata.obs["cell"])
mdata["psm"].uns["decoy"]["cell"] = mdata["psm"].uns["decoy"]["filename"].map(mdata.obs["cell"])
mdata = mm.pp.add_filter(mdata, modality="psm", column="cell", keep="contains", value="C10|SVEC")
mdata = mm.pp.apply_filter(mdata, modality="psm")
mdata = mdata[mdata.obs["cell"].isin(["C10", "SVEC"])].copy()
mdata
mdata["psm"].var["cell"] = mdata["psm"].var["filename"].map(mdata.obs["cell"])
mdata["psm"].uns["decoy"]["cell"] = mdata["psm"].uns["decoy"]["filename"].map(mdata.obs["cell"])
mdata = mm.pp.add_filter(mdata, modality="psm", column="cell", keep="contains", value="C10|SVEC")
mdata = mm.pp.apply_filter(mdata, modality="psm")
mdata = mdata[mdata.obs["cell"].isin(["C10", "SVEC"])].copy()
mdata
Out[7]:
MuData object with n_obs × n_vars = 70 × 99285
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
uns: '_cmd'
2 modalities
psm: 70 x 74025
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass', 'cell'
uns: 'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
varm: 'search_result', 'filter'
peptide: 70 x 25260
var: 'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value'
uns: 'level', 'decoy'
Filtering - peptide¶
In [8]:
Copied!
mdata = mm.pp.add_filter(mdata, modality="peptide", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.apply_filter(mdata, modality="peptide")
mdata.mod["peptide"] = mdata["peptide"][:, mdata["peptide"].to_df().dropna(axis=1, how="all").columns]
mdata.update()
mdata
mdata = mm.pp.add_filter(mdata, modality="peptide", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.apply_filter(mdata, modality="peptide")
mdata.mod["peptide"] = mdata["peptide"][:, mdata["peptide"].to_df().dropna(axis=1, how="all").columns]
mdata.update()
mdata
Out[8]:
MuData object with n_obs × n_vars = 70 × 92871
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
uns: '_cmd'
2 modalities
psm: 70 x 74025
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass', 'cell'
uns: 'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
varm: 'search_result', 'filter'
peptide: 70 x 18846
var: 'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value'
uns: 'level', 'decoy', 'filter', 'decoy_filter'
varm: 'filter'
Normalization¶
Here, we log2 transform and normalize the data at the peptide level.
Median centering normalization is applied using mm.pp.normalize() function.
In [ ]:
Copied!
mdata["psm"].layers["raw"] = mdata["psm"].X.copy()
mdata = mm.pp.log2_transform(mdata, modality="peptide")
mdata = mm.pp.normalize(mdata, modality="peptide", method="median")
mdata["psm"].layers["raw"] = mdata["psm"].X.copy()
mdata = mm.pp.log2_transform(mdata, modality="peptide")
mdata = mm.pp.normalize(mdata, modality="peptide", method="median")
/Users/jl/Scripts/msmu/msmu/_preprocessing/_normalise.py:29: ImplicitModificationWarning: Modifying `X` on a view results in data being overridden mdata[modality].X = log2_arr /Users/jl/Scripts/msmu/msmu/_preprocessing/_normalise.py:123: ImplicitModificationWarning: Modifying `X` on a view results in data being overridden mdata.mod[modality].X = normalised_arr
Protein inference¶
You can infer protein-level data from peptide-level data using the mm.pp.infer_protein() function.
In [10]:
Copied!
mdata = mm.pp.infer_protein(mdata)
mdata = mm.pp.infer_protein(mdata)
INFO - Starting protein inference INFO - Initial proteins: 4268 INFO - Removed indistinguishable: 197 INFO - Removed subsettable: 263 INFO - Removed subsumable: 19 INFO - Total protein groups: 3789
Protein¶
In [11]:
Copied!
mdata = mm.pp.to_protein(mdata, top_n=3, rank_method="total_intensity")
mdata = mm.pp.to_protein(mdata, top_n=3, rank_method="total_intensity")
INFO - Ranking features by 'total_intensity' to select top 3 features. INFO - Protein-level identifications : 3595 (3054 at 1% FDR)
Filtering - protein¶
In [12]:
Copied!
mdata = mm.pp.add_filter(mdata, modality="protein", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.apply_filter(mdata, modality="protein")
mdata.mod["protein"] = mdata["protein"][:, mdata["protein"].to_df().dropna(axis=1, how="all").columns]
mdata.update()
mdata
mdata = mm.pp.add_filter(mdata, modality="protein", column="q_value", keep="lt", value=0.01)
mdata = mm.pp.apply_filter(mdata, modality="protein")
mdata.mod["protein"] = mdata["protein"][:, mdata["protein"].to_df().dropna(axis=1, how="all").columns]
mdata.update()
mdata
Out[12]:
MuData object with n_obs × n_vars = 70 × 95925
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
uns: '_cmd'
3 modalities
psm: 70 x 74025
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass', 'cell'
uns: 'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
varm: 'search_result', 'filter'
layers: 'raw'
peptide: 70 x 18846
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value', 'protein_group', 'peptide_type'
uns: 'level', 'decoy', 'filter', 'decoy_filter'
varm: 'filter'
protein: 70 x 3054
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'count_psm', 'count_stripped_peptide', 'PEP', 'q_value'
uns: 'level', 'decoy', 'filter', 'decoy_filter'
varm: 'filter'
Imputation¶
In [13]:
Copied!
import matplotlib.pyplot as plt
import pandas as pd
import pimmslearn.plotting as pmp
import pimmslearn.sampling as pms
import pimmslearn.models as pmm
from pimmslearn.plotting.defaults import color_model_mapping
from pimmslearn.sklearn.ae_transformer import AETransformer
from pimmslearn.sklearn.cf_transformer import CollaborativeFilteringTransformer
pmp.make_large_descriptors("8")
index_name: str = "Sample ID"
column_name: str = "protein group"
frac_non_train: float = 0.1
frac_mnar: float = 0.05
random_state: int = 42
import matplotlib.pyplot as plt
import pandas as pd
import pimmslearn.plotting as pmp
import pimmslearn.sampling as pms
import pimmslearn.models as pmm
from pimmslearn.plotting.defaults import color_model_mapping
from pimmslearn.sklearn.ae_transformer import AETransformer
from pimmslearn.sklearn.cf_transformer import CollaborativeFilteringTransformer
pmp.make_large_descriptors("8")
index_name: str = "Sample ID"
column_name: str = "protein group"
frac_non_train: float = 0.1
frac_mnar: float = 0.05
random_state: int = 42
In [14]:
Copied!
df = mdata["protein"].to_df()
df.index.name = "Sample ID"
df.columns.name = "protein group"
df = mdata["protein"].to_df()
df.index.name = "Sample ID"
df.columns.name = "protein group"
Check missing value pattern¶
In [15]:
Copied!
ax = pmp.data.plot_feat_median_over_prop_missing(data=df, type="boxplot")
ax = pmp.data.plot_feat_median_over_prop_missing(data=df, type="boxplot")
/Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/pimmslearn/plotting/data.py:327: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` ax = ax[0] # returned series due to by argument?
Robust missing value imputation¶
In [16]:
Copied!
df = df.stack().to_frame("intensity")
splits, thresholds, fake_na_mcar, fake_na_mnar = pms.sample_mnar_mcar(
df_long=df,
frac_non_train=frac_non_train,
frac_mnar=frac_mnar,
random_state=random_state,
)
splits = pms.check_split_integrity(splits)
df = df.stack().to_frame("intensity")
splits, thresholds, fake_na_mcar, fake_na_mnar = pms.sample_mnar_mcar(
df_long=df,
frac_non_train=frac_non_train,
frac_mnar=frac_mnar,
random_state=random_state,
)
splits = pms.check_split_integrity(splits)
/Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/pimmslearn/sampling.py:209: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead loc=float(quantile_frac), /Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/pimmslearn/sampling.py:210: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead scale=float(0.3 * df_long.std()),
In [17]:
Copied!
cf_model = CollaborativeFilteringTransformer(
target_column="intensity",
sample_column="Sample ID",
item_column="protein group",
out_folder="runs/scikit_interface",
)
cf_model.fit(splits.train_X, splits.val_y, cuda=False, epochs_max=20)
cf_model = CollaborativeFilteringTransformer(
target_column="intensity",
sample_column="Sample ID",
item_column="protein group",
out_folder="runs/scikit_interface",
)
cf_model.fit(splits.train_X, splits.val_y, cuda=False, epochs_max=20)
suggested_lr.valley = 0.00525
| epoch | train_loss | valid_loss | time |
|---|---|---|---|
| 0 | 6.320165 | 6.174103 | 00:00 |
| 1 | 5.522466 | 4.395256 | 00:00 |
| 2 | 3.587925 | 2.049297 | 00:00 |
| 3 | 2.431448 | 1.833395 | 00:00 |
| 4 | 1.831739 | 1.518024 | 00:00 |
| 5 | 1.435816 | 1.355048 | 00:00 |
| 6 | 1.189103 | 1.294942 | 00:00 |
| 7 | 1.034226 | 1.249249 | 00:00 |
| 8 | 0.919461 | 1.232482 | 00:00 |
| 9 | 0.839021 | 1.222002 | 00:00 |
| 10 | 0.773871 | 1.213310 | 00:00 |
| 11 | 0.722318 | 1.214855 | 00:00 |
No improvement since epoch 10: early stopping
Out[17]:
CollaborativeFilteringTransformer(item_column='protein group',
out_folder=Path('runs/scikit_interface'),
sample_column='Sample ID',
target_column='intensity')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CollaborativeFilteringTransformer(item_column='protein group',
out_folder=Path('runs/scikit_interface'),
sample_column='Sample ID',
target_column='intensity')In [18]:
Copied!
df_imputed = cf_model.transform(df).unstack()
assert df_imputed.isna().sum().sum() == 0
df_imputed = cf_model.transform(df).unstack()
assert df_imputed.isna().sum().sum() == 0
Check imputation results¶
In [19]:
Copied!
df_imputed = df_imputed.stack() # long-format
observed = df_imputed.loc[df.index]
imputed = df_imputed.loc[df_imputed.index.difference(df.index)]
df_imputed = df_imputed.unstack() # back to wide-format
# some checks
assert len(df) == len(observed)
assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)
fig, axes = plt.subplots(2, figsize=(8, 4))
min_max = pmp.data.get_min_max_iterable([observed, imputed])
label_template = "{method} (N={n:,d})"
ax, _ = pmp.data.plot_histogram_intensities(
observed,
ax=axes[0],
min_max=min_max,
label=label_template.format(
method="measured",
n=len(observed),
),
color="grey",
alpha=1,
)
_ = ax.legend()
ax, _ = pmp.data.plot_histogram_intensities(
imputed,
ax=axes[1],
min_max=min_max,
label=label_template.format(
method="CF imputed",
n=len(imputed),
),
color=color_model_mapping["CF"],
alpha=1,
)
_ = ax.legend()
df_imputed = df_imputed.stack() # long-format
observed = df_imputed.loc[df.index]
imputed = df_imputed.loc[df_imputed.index.difference(df.index)]
df_imputed = df_imputed.unstack() # back to wide-format
# some checks
assert len(df) == len(observed)
assert df_imputed.shape[0] * df_imputed.shape[1] == len(imputed) + len(observed)
fig, axes = plt.subplots(2, figsize=(8, 4))
min_max = pmp.data.get_min_max_iterable([observed, imputed])
label_template = "{method} (N={n:,d})"
ax, _ = pmp.data.plot_histogram_intensities(
observed,
ax=axes[0],
min_max=min_max,
label=label_template.format(
method="measured",
n=len(observed),
),
color="grey",
alpha=1,
)
_ = ax.legend()
ax, _ = pmp.data.plot_histogram_intensities(
imputed,
ax=axes[1],
min_max=min_max,
label=label_template.format(
method="CF imputed",
n=len(imputed),
),
color=color_model_mapping["CF"],
alpha=1,
)
_ = ax.legend()
In [20]:
Copied!
splits.to_wide_format()
splits.val_y = pd.DataFrame(pd.NA, index=splits.train_X.index, columns=splits.train_X.columns).fillna(splits.val_y)
splits.to_wide_format()
splits.val_y = pd.DataFrame(pd.NA, index=splits.train_X.index, columns=splits.train_X.columns).fillna(splits.val_y)
Imputation with denoising autoencoder¶
In [21]:
Copied!
model_selected = "DAE"
model = AETransformer(
model=model_selected,
hidden_layers=[512],
latent_dim=50,
out_folder="runs/scikit_interface",
batch_size=10,
)
model_selected = "DAE"
model = AETransformer(
model=model_selected,
hidden_layers=[512],
latent_dim=50,
out_folder="runs/scikit_interface",
batch_size=10,
)
In [22]:
Copied!
model.fit(splits.train_X, splits.val_y, epochs_max=50, cuda=False)
model.fit(splits.train_X, splits.val_y, epochs_max=50, cuda=False)
| epoch | train_loss | valid_loss | time |
|---|---|---|---|
| 0 | 31082.289062 | 1663.821411 | 00:00 |
| 1 | 29537.611328 | 1580.292236 | 00:00 |
| 2 | 27614.193359 | 1379.376953 | 00:00 |
| 3 | 25691.705078 | 1447.400391 | 00:00 |
| 4 | 24164.673828 | 1423.883423 | 00:00 |
| 5 | 22979.968750 | 1181.390015 | 00:00 |
| 6 | 21837.660156 | 1150.241577 | 00:00 |
| 7 | 20585.669922 | 1149.429443 | 00:00 |
| 8 | 19582.843750 | 1130.486206 | 00:00 |
| 9 | 18656.513672 | 1136.786377 | 00:00 |
| 10 | 18028.900391 | 1183.855469 | 00:00 |
| 11 | 17458.158203 | 1220.037476 | 00:00 |
| 12 | 16862.056641 | 1204.772095 | 00:00 |
| 13 | 16387.130859 | 1181.086670 | 00:00 |
| 14 | 15914.469727 | 1175.465820 | 00:00 |
| 15 | 15412.682617 | 1172.427368 | 00:00 |
| 16 | 14872.686523 | 1138.039551 | 00:00 |
| 17 | 14321.765625 | 1118.993286 | 00:00 |
| 18 | 13806.336914 | 1138.566040 | 00:00 |
| 19 | 13332.366211 | 1119.463257 | 00:00 |
| 20 | 12888.664062 | 1118.378662 | 00:00 |
| 21 | 12363.695312 | 1109.387085 | 00:00 |
| 22 | 11877.583008 | 1104.461548 | 00:00 |
| 23 | 11523.218750 | 1100.300537 | 00:00 |
| 24 | 11178.151367 | 1109.922119 | 00:00 |
| 25 | 10833.538086 | 1103.574463 | 00:00 |
| 26 | 10543.821289 | 1106.167847 | 00:00 |
| 27 | 10258.711914 | 1107.717651 | 00:00 |
| 28 | 9946.045898 | 1100.307251 | 00:00 |
| 29 | 9641.920898 | 1093.588257 | 00:00 |
| 30 | 9378.814453 | 1095.416016 | 00:00 |
| 31 | 9125.350586 | 1090.117310 | 00:00 |
| 32 | 8903.874023 | 1085.963135 | 00:00 |
| 33 | 8626.411133 | 1087.794922 | 00:00 |
| 34 | 8382.924805 | 1089.914551 | 00:00 |
| 35 | 8222.190430 | 1089.441284 | 00:00 |
| 36 | 7962.709961 | 1080.896240 | 00:00 |
| 37 | 7748.377930 | 1073.487915 | 00:00 |
| 38 | 7572.700195 | 1073.060303 | 00:00 |
| 39 | 7424.407715 | 1074.218140 | 00:00 |
| 40 | 7307.616211 | 1078.132324 | 00:00 |
| 41 | 7132.653809 | 1077.919922 | 00:00 |
| 42 | 7049.728516 | 1077.697388 | 00:00 |
| 43 | 6878.440918 | 1074.800903 | 00:00 |
| 44 | 6760.032715 | 1074.024658 | 00:00 |
| 45 | 6639.622559 | 1071.721191 | 00:00 |
| 46 | 6609.085938 | 1075.444824 | 00:00 |
| 47 | 6488.123047 | 1074.282593 | 00:00 |
| 48 | 6376.672363 | 1073.937500 | 00:00 |
| 49 | 6316.729492 | 1074.901123 | 00:00 |
Out[22]:
AETransformer(batch_size=10, hidden_layers=[512], latent_dim=50,
model=<class 'pimmslearn.models.ae.Autoencoder'>,
out_folder=Path('runs/scikit_interface'))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
AETransformer(batch_size=10, hidden_layers=[512], latent_dim=50,
model=<class 'pimmslearn.models.ae.Autoencoder'>,
out_folder=Path('runs/scikit_interface'))In [23]:
Copied!
df_imputed = model.transform(splits.train_X).stack()
df_imputed = model.transform(splits.train_X).stack()
In [24]:
Copied!
pred_val = splits.val_y.stack().to_frame("observed")
pred_val[model_selected] = df_imputed
val_metrics = pmm.calculte_metrics(pred_val, "observed")
fig, ax = plt.subplots(figsize=(8, 2))
ax, errors_binned = pmp.errors.plot_errors_by_median(
pred=pred_val,
target_col="observed",
feat_medians=splits.train_X.median(),
ax=ax,
metric_name="MAE",
palette=color_model_mapping,
)
pred_val = splits.val_y.stack().to_frame("observed")
pred_val[model_selected] = df_imputed
val_metrics = pmm.calculte_metrics(pred_val, "observed")
fig, ax = plt.subplots(figsize=(8, 2))
ax, errors_binned = pmp.errors.plot_errors_by_median(
pred=pred_val,
target_col="observed",
feat_medians=splits.train_X.median(),
ax=ax,
metric_name="MAE",
palette=color_model_mapping,
)
/Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_vals = vals.groupby(grouper) /Users/jl/.local/share/virtualenvs/msmu-c3VCzU_G/lib/python3.13/site-packages/seaborn/categorical.py:641: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_vals = vals.groupby(grouper)
In [25]:
Copied!
splits.to_long_format()
df_imputed = df_imputed.replace(splits.val_y).replace(splits.test_y)
splits.to_long_format()
df_imputed = df_imputed.replace(splits.val_y).replace(splits.test_y)
Check imputation results¶
In [26]:
Copied!
observed = df_imputed.loc[df.index].squeeze()
imputed = df_imputed.loc[df_imputed.index.difference(df.index)].squeeze()
fig, axes = plt.subplots(2, figsize=(8, 4))
min_max = pmp.data.get_min_max_iterable([observed, imputed])
label_template = "{method} (N={n:,d})"
ax, _ = pmp.data.plot_histogram_intensities(
observed,
ax=axes[0],
min_max=min_max,
label=label_template.format(
method="measured",
n=len(observed),
),
color="grey",
alpha=1,
)
_ = ax.legend()
ax, _ = pmp.data.plot_histogram_intensities(
imputed,
ax=axes[1],
min_max=min_max,
label=label_template.format(
method=f"{model_selected} imputed",
n=len(imputed),
),
color=color_model_mapping[model_selected],
alpha=1,
)
_ = ax.legend()
observed = df_imputed.loc[df.index].squeeze()
imputed = df_imputed.loc[df_imputed.index.difference(df.index)].squeeze()
fig, axes = plt.subplots(2, figsize=(8, 4))
min_max = pmp.data.get_min_max_iterable([observed, imputed])
label_template = "{method} (N={n:,d})"
ax, _ = pmp.data.plot_histogram_intensities(
observed,
ax=axes[0],
min_max=min_max,
label=label_template.format(
method="measured",
n=len(observed),
),
color="grey",
alpha=1,
)
_ = ax.legend()
ax, _ = pmp.data.plot_histogram_intensities(
imputed,
ax=axes[1],
min_max=min_max,
label=label_template.format(
method=f"{model_selected} imputed",
n=len(imputed),
),
color=color_model_mapping[model_selected],
alpha=1,
)
_ = ax.legend()
Push imputed data to mudata object¶
In [27]:
Copied!
mdata.mod["protein"].layers["imputed"] = df_imputed.unstack()
mdata.update()
mdata
mdata.mod["protein"].layers["imputed"] = df_imputed.unstack()
mdata.update()
mdata
/var/folders/pp/7ts5fh4x5hl81rnn895l34ph0000gn/T/ipykernel_17793/3088553350.py:1: ImplicitModificationWarning: Setting element `.layers['imputed']` of view, initializing view as actual. mdata.mod["protein"].layers["imputed"] = df_imputed.unstack()
Out[27]:
MuData object with n_obs × n_vars = 70 × 95925
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
uns: '_cmd'
3 modalities
psm: 70 x 74025
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'proteins', 'peptide', 'stripped_peptide', 'filename', 'scan_num', 'charge', 'peptide_length', 'missed_cleavages', 'semi_enzymatic', 'contaminant', 'PEP', 'q_value', 'rt', 'calcmass', 'cell'
uns: 'level', 'search_engine', 'quantification', 'label', 'acquisition', 'identification_file', 'quantification_file', 'decoy', 'filter', 'decoy_filter'
varm: 'search_result', 'filter'
layers: 'raw'
peptide: 70 x 18846
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'peptide', 'proteins', 'stripped_peptide', 'count_psm', 'PEP', 'q_value', 'protein_group', 'peptide_type'
uns: 'level', 'decoy', 'filter', 'decoy_filter'
varm: 'filter'
protein: 70 x 3054
obs: 'set', 'sample', 'cell', 'condition', 'sample_rna'
var: 'count_psm', 'count_stripped_peptide', 'PEP', 'q_value'
uns: 'level', 'decoy', 'filter', 'decoy_filter'
varm: 'filter'
layers: 'imputed'
Save MuData object¶
In [28]:
Copied!
mdata.write_h5mu("MSV000089280.h5mu")
mdata.write_h5mu("MSV000089280.h5mu")