Process scRNA-seq Data with msmu¶
In [1]:
Copied!
from pathlib import Path
import pandas as pd
import requests
import scanpy as sc
import tarfile
from pathlib import Path
import pandas as pd
import requests
import scanpy as sc
import tarfile
In [2]:
Copied!
url = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/GSE201575.tar.gz"
meta = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/meta.csv"
base_dir = Path(url).name.split(".")[0]
r = requests.get(url)
r.raise_for_status()
with open(Path(url).name, "wb") as f:
f.write(r.content)
with tarfile.open(Path(url).name, "r:gz") as tar:
members = [m for m in tar.getmembers() if not Path(m.name).name.startswith("._")]
tar.extractall(members=members)
url = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/GSE201575.tar.gz"
meta = "https://github.com/bertis-informatics/msmu/releases/download/0.2.2/meta.csv"
base_dir = Path(url).name.split(".")[0]
r = requests.get(url)
r.raise_for_status()
with open(Path(url).name, "wb") as f:
f.write(r.content)
with tarfile.open(Path(url).name, "r:gz") as tar:
members = [m for m in tar.getmembers() if not Path(m.name).name.startswith("._")]
tar.extractall(members=members)
/var/folders/pp/7ts5fh4x5hl81rnn895l34ph0000gn/T/ipykernel_17737/3812210948.py:13: DeprecationWarning: Python 3.14 will, by default, filter extracted tar archives and reject files or modify their metadata. Use the filter argument to control this behavior. tar.extractall(members=members)
Read count matrix¶
In [3]:
Copied!
path_list = Path(base_dir).glob("*.txt.gz")
path_list = sorted(path_list)
adata_list = []
for p in path_list:
a = sc.read_text(p, delimiter="\t", first_column_names=True).T
a.obs.index = [p.stem.split(".")[0].split("_")[1]]
a.obs["filename"] = [p.stem]
adata_list.append(a)
adata = sc.concat(adata_list)
path_list = Path(base_dir).glob("*.txt.gz")
path_list = sorted(path_list)
adata_list = []
for p in path_list:
a = sc.read_text(p, delimiter="\t", first_column_names=True).T
a.obs.index = [p.stem.split(".")[0].split("_")[1]]
a.obs["filename"] = [p.stem]
adata_list.append(a)
adata = sc.concat(adata_list)
Add metadata and filter samples on use¶
In [4]:
Copied!
meta_df = pd.read_csv(meta)
meta_df = meta_df.dropna()
meta_df.index = meta_df["sample_rna"].values
adata.obs = adata.obs.merge(meta_df, left_index=True, right_index=True, how="left")
adata = adata[adata.obs.dropna().index.to_list()].copy()
adata.obs.index = adata.obs["sample_id"].values
adata
meta_df = pd.read_csv(meta)
meta_df = meta_df.dropna()
meta_df.index = meta_df["sample_rna"].values
adata.obs = adata.obs.merge(meta_df, left_index=True, right_index=True, how="left")
adata = adata[adata.obs.dropna().index.to_list()].copy()
adata.obs.index = adata.obs["sample_id"].values
adata
Out[4]:
AnnData object with n_obs × n_vars = 70 × 40207
obs: 'filename', 'set', 'sample_id', 'sample', 'cell', 'condition', 'sample_rna'
Filtering and normalization¶
In [5]:
Copied!
adata.layers["counts"] = adata.X.copy()
sc.pp.filter_genes(adata, min_cells=3, inplace=True)
sc.pp.filter_cells(adata, min_genes=200, inplace=True)
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
adata
adata.layers["counts"] = adata.X.copy()
sc.pp.filter_genes(adata, min_cells=3, inplace=True)
sc.pp.filter_cells(adata, min_genes=200, inplace=True)
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
adata
Out[5]:
AnnData object with n_obs × n_vars = 70 × 13451
obs: 'filename', 'set', 'sample_id', 'sample', 'cell', 'condition', 'sample_rna', 'n_genes'
var: 'n_cells'
uns: 'log1p'
layers: 'counts'
Save AnnData object¶
In [6]:
Copied!
adata.write_h5ad("GSE201575.h5ad")
adata.write_h5ad("GSE201575.h5ad")