Query individual files#
scRNA-seq data integration is the process of analyzing data from several scRNA sequencing experiments to uncover common or distinct biological insights and patterns.
Here, we'll demonstrate how to fetch two scRNA-seq datasets by registered metadata, such as cell types, and then integrate them.
import lamindb as ln
import lnschema_bionty as lb
import anndata as ad
💡 loaded instance: testuser1/test-scrna (lamindb 0.54.3)
ln.track()
💡 notebook imports: anndata==0.9.2 lamindb==0.54.3 lnschema_bionty==0.31.2
💡 Transform(id='agayZTonayqAz8', name='Query individual files', short_name='scrna2', version='0', type=notebook, updated_at=2023-09-29 14:46:36, created_by_id='DzTjkKse')
💡 Run(id='kJNEKVsuf6S5JDofFIgB', run_at=2023-09-29 14:46:36, transform_id='agayZTonayqAz8', created_by_id='DzTjkKse')
Access #
Query files by provenance metadata#
users = ln.User.lookup()
ln.Transform.filter(created_by=users.testuser1).search("scrna")
id | __ratio__ | |
---|---|---|
name | ||
scRNA-seq | Nv48yAceNSh8z8 | 90.0 |
Append a new batch of data | ManDYgmftZ8Cz8 | 36.0 |
Query individual files | agayZTonayqAz8 | 36.0 |
transform = ln.Transform.filter(id="Nv48yAceNSh8z8").one()
ln.File.filter(transform=transform).df()
storage_id | key | suffix | accessor | description | version | size | hash | hash_type | transform_id | run_id | initial_version_id | updated_at | created_by_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||||||
nV0w72HVEfJeK6lgb7BO | 975nKuX0 | None | .h5ad | AnnData | Conde22 | None | 28049505 | WEFcMZxJNmMiUOFrcSTaig | md5 | Nv48yAceNSh8z8 | Pd5UweAXC3cCH1aOMdr1 | None | 2023-09-29 14:45:51 | DzTjkKse |
Query files based on biological metadata#
assays = lb.ExperimentalFactor.lookup()
species = lb.Species.lookup()
cell_types = lb.CellType.lookup()
query = ln.File.filter(
experimental_factors=assays.single_cell_rna_sequencing,
species=species.human,
cell_types=cell_types.gamma_delta_t_cell,
)
query.df()
storage_id | key | suffix | accessor | description | version | size | hash | hash_type | transform_id | run_id | initial_version_id | updated_at | created_by_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
id | ||||||||||||||
Vf6s6oe8cTQ8oqMCCjeT | 975nKuX0 | None | .h5ad | AnnData | 10x reference adata | None | 660792 | a2V0IgOjMRHsCeZH169UOQ | md5 | ManDYgmftZ8Cz8 | 0eCAXKvC5AGTwRu42M0X | None | 2023-09-29 14:46:24 | DzTjkKse |
nV0w72HVEfJeK6lgb7BO | 975nKuX0 | None | .h5ad | AnnData | Conde22 | None | 28049505 | WEFcMZxJNmMiUOFrcSTaig | md5 | Nv48yAceNSh8z8 | Pd5UweAXC3cCH1aOMdr1 | None | 2023-09-29 14:45:51 | DzTjkKse |
Transform #
Compare gene sets#
Get file objects:
query = ln.File.filter()
file1, file2 = query.list()
file1.describe()
File(id='nV0w72HVEfJeK6lgb7BO', suffix='.h5ad', accessor='AnnData', description='Conde22', size=28049505, hash='WEFcMZxJNmMiUOFrcSTaig', hash_type='md5', updated_at=2023-09-29 14:45:51)
Provenance:
🗃️ storage: Storage(id='975nKuX0', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-09-29 14:44:56, created_by_id='DzTjkKse')
📔 transform: Transform(id='Nv48yAceNSh8z8', name='scRNA-seq', short_name='scrna', version='0', type='notebook', updated_at=2023-09-29 14:45:51, created_by_id='DzTjkKse')
👣 run: Run(id='Pd5UweAXC3cCH1aOMdr1', run_at=2023-09-29 14:44:58, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-09-29 14:44:56)
⬇️ input_of (core.Run): ['2023-09-29 14:45:55']
Features:
var: FeatureSet(id='EJQhGDAUMVCAW7a878Is', n=36503, type='number', registry='bionty.Gene', hash='dnRexHCtxtmOU81_EpoJ', updated_at=2023-09-29 14:45:40, modality_id='Tkw6vO00', created_by_id='DzTjkKse')
'KRT4', 'LHX5-AS1', 'OPN1LW', 'STYX', 'CCDC158', 'None', 'None', 'MKNK1-AS1', 'None', 'None', 'LNPEP', 'LINC02485', 'None', 'None', 'IGHEP2', 'CYFIP1', 'A4GNT', 'SLC14A2', 'None', 'None', ...
obs: FeatureSet(id='KEEZXO20pmTjLPROaTDE', n=4, registry='core.Feature', hash='NUCABLKrrAle7o2cv7hj', updated_at=2023-09-29 14:45:45, modality_id='jUAc2M1C', created_by_id='DzTjkKse')
🔗 donor (12, core.ULabel): '621B', '640C', 'D496', 'A52', 'A36', '582C', 'A29', 'D503', 'A37', 'A31', ...
🔗 cell_type (32, bionty.CellType): 'progenitor cell', 'T follicular helper cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'dendritic cell, human', 'CD8-positive, alpha-beta memory T cell, CD45RO-positive', 'classical monocyte', 'regulatory T cell', 'group 3 innate lymphoid cell', 'CD4-positive helper T cell', 'non-classical monocyte', ...
🔗 assay (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v2', '10x 3' v3', '10x 5' v1'
🔗 tissue (17, bionty.Tissue): 'mesenteric lymph node', 'lung', 'sigmoid colon', 'thymus', 'lamina propria', 'bone marrow', 'ileum', 'jejunal epithelium', 'spleen', 'thoracic lymph node', ...
Labels:
🏷️ species (1, bionty.Species): 'human'
🏷️ tissues (17, bionty.Tissue): 'mesenteric lymph node', 'lung', 'sigmoid colon', 'thymus', 'lamina propria', 'bone marrow', 'ileum', 'jejunal epithelium', 'spleen', 'thoracic lymph node', ...
🏷️ cell_types (32, bionty.CellType): 'progenitor cell', 'T follicular helper cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'dendritic cell, human', 'CD8-positive, alpha-beta memory T cell, CD45RO-positive', 'classical monocyte', 'regulatory T cell', 'group 3 innate lymphoid cell', 'CD4-positive helper T cell', 'non-classical monocyte', ...
🏷️ experimental_factors (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v2', '10x 3' v3', '10x 5' v1'
🏷️ ulabels (12, core.ULabel): '621B', '640C', 'D496', 'A52', 'A36', '582C', 'A29', 'D503', 'A37', 'A31', ...
file1.view_flow()
file2.describe()
File(id='Vf6s6oe8cTQ8oqMCCjeT', suffix='.h5ad', accessor='AnnData', description='10x reference adata', size=660792, hash='a2V0IgOjMRHsCeZH169UOQ', hash_type='md5', updated_at=2023-09-29 14:46:24)
Provenance:
🗃️ storage: Storage(id='975nKuX0', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-09-29 14:44:56, created_by_id='DzTjkKse')
📔 transform: Transform(id='ManDYgmftZ8Cz8', name='Append a new batch of data', short_name='scrna1', version='0', type='notebook', updated_at=2023-09-29 14:46:25, created_by_id='DzTjkKse')
👣 run: Run(id='0eCAXKvC5AGTwRu42M0X', run_at=2023-09-29 14:45:55, transform_id='ManDYgmftZ8Cz8', created_by_id='DzTjkKse')
👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-09-29 14:44:56)
Features:
var: FeatureSet(id='9ZyNwE85EcgUEwAEUkkJ', n=754, type='number', registry='bionty.Gene', hash='WMDxN7253SdzGwmznV5d', updated_at=2023-09-29 14:46:24, modality_id='Tkw6vO00', created_by_id='DzTjkKse')
'COMMD5', 'MTPN', 'TNFRSF4', 'ANKRD12', 'IL32', 'MATK', 'EIF3G', 'JAML', 'SERPINF1', 'MARCKSL1', 'COA1', 'IGHA1', 'ATP5MF', 'TXN', 'UQCRC1', 'HNRNPK', 'CRIP1', 'SDHC', 'PSMC5', 'S1PR4', ...
obs: FeatureSet(id='UCZgzCGfvvQjnHbb8ywh', n=1, registry='core.Feature', hash='a0witEZwk8c1sJvQ0-Vg', updated_at=2023-09-29 14:46:24, modality_id='jUAc2M1C', created_by_id='DzTjkKse')
🔗 cell_type (9, bionty.CellType): 'gamma-delta T cell', 'B cell, CD19-positive', 'CD4-positive, alpha-beta T cell', 'cytotoxic T cell', 'dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD16-positive, CD56-dim natural killer cell, human', 'monocyte', 'CD24-positive, CD4 single-positive thymocyte'
external: FeatureSet(id='9szxE5HWrIb4E8kRB5mJ', n=2, registry='core.Feature', hash='Va1p2Yt0XUK6Qju8q27m', updated_at=2023-09-29 14:46:24, modality_id='jUAc2M1C', created_by_id='DzTjkKse')
🔗 assay (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
🔗 species (1, bionty.Species): 'human'
Labels:
🏷️ species (1, bionty.Species): 'human'
🏷️ cell_types (9, bionty.CellType): 'gamma-delta T cell', 'B cell, CD19-positive', 'CD4-positive, alpha-beta T cell', 'cytotoxic T cell', 'dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD16-positive, CD56-dim natural killer cell, human', 'monocyte', 'CD24-positive, CD4 single-positive thymocyte'
🏷️ experimental_factors (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
file2.view_flow()
Load files into memory:
file1_adata = file1.load()
file2_adata = file2.load()
Here we compute shared genes without loading files:
file1_genes = file1.features["var"]
file2_genes = file2.features["var"]
shared_genes = file1_genes & file2_genes
len(shared_genes)
749
shared_genes.list("symbol")[:10]
['ARF6',
'TOMM7',
'U2AF1',
'NOSIP',
'GOPC',
'SLC3A2',
'ATP5ME',
'ACAA1',
'MFSD14B',
'CYTL1']
Compare cell types#
file1_celltypes = file1.cell_types.all()
file2_celltypes = file2.cell_types.all()
shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names
['gamma-delta T cell', 'CD16-positive, CD56-dim natural killer cell, human']
We can now subset the two datasets by shared cell types:
file1_adata_subset = file1_adata[
file1_adata.obs["cell_type"].isin(shared_celltypes_names)
]
file2_adata_subset = file2_adata[
file2_adata.obs["cell_type"].isin(shared_celltypes_names)
]
Concatenate subsetted datasets:
adata_concat = ad.concat(
[file1_adata_subset, file2_adata_subset],
label="file",
keys=[file1.description, file2.description],
)
adata_concat
AnnData object with n_obs × n_vars = 187 × 749
obs: 'cell_type', 'file'
obsm: 'X_umap'
adata_concat.obs.value_counts()
cell_type file
CD16-positive, CD56-dim natural killer cell, human Conde22 114
gamma-delta T cell Conde22 66
10x reference adata 4
CD16-positive, CD56-dim natural killer cell, human 10x reference adata 3
dtype: int64