Query individual files#

scRNA-seq data integration is the process of analyzing data from several scRNA sequencing experiments to uncover common or distinct biological insights and patterns.

Here, we demonstrate how to query two scRNA-seq datasets by registered metadata, such as cell types, and then integrate them.

import lamindb as ln
import lnschema_bionty as lb
import anndata as ad

💡 loaded instance: testuser1/test-scrna (lamindb 0.54.3)

ln.track()

💡 notebook imports: anndata==0.9.2 lamindb==0.54.3 lnschema_bionty==0.31.2

💡 Transform(id='agayZTonayqAz8', name='Query individual files', short_name='scrna2', version='0', type=notebook, updated_at=2023-09-29 14:46:36, created_by_id='DzTjkKse')

💡 Run(id='kJNEKVsuf6S5JDofFIgB', run_at=2023-09-29 14:46:36, transform_id='agayZTonayqAz8', created_by_id='DzTjkKse')

Access #

Query files by provenance metadata#

users = ln.User.lookup()

ln.Transform.filter(created_by=users.testuser1).search("scrna")

	id	__ratio__
name
scRNA-seq	Nv48yAceNSh8z8	90.0
Append a new batch of data	ManDYgmftZ8Cz8	36.0
Query individual files	agayZTonayqAz8	36.0

transform = ln.Transform.filter(id="Nv48yAceNSh8z8").one()

ln.File.filter(transform=transform).df()

	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	transform_id	run_id	initial_version_id	updated_at	created_by_id
id
nV0w72HVEfJeK6lgb7BO	975nKuX0	None	.h5ad	AnnData	Conde22	None	28049505	WEFcMZxJNmMiUOFrcSTaig	md5	Nv48yAceNSh8z8	Pd5UweAXC3cCH1aOMdr1	None	2023-09-29 14:45:51	DzTjkKse

Query files based on biological metadata#

assays = lb.ExperimentalFactor.lookup()
species = lb.Species.lookup()
cell_types = lb.CellType.lookup()

query = ln.File.filter(
    experimental_factors=assays.single_cell_rna_sequencing,
    species=species.human,
    cell_types=cell_types.gamma_delta_t_cell,
)

query.df()

	storage_id	key	suffix	accessor	description	version	size	hash	hash_type	transform_id	run_id	initial_version_id	updated_at	created_by_id
id
Vf6s6oe8cTQ8oqMCCjeT	975nKuX0	None	.h5ad	AnnData	10x reference adata	None	660792	a2V0IgOjMRHsCeZH169UOQ	md5	ManDYgmftZ8Cz8	0eCAXKvC5AGTwRu42M0X	None	2023-09-29 14:46:24	DzTjkKse
nV0w72HVEfJeK6lgb7BO	975nKuX0	None	.h5ad	AnnData	Conde22	None	28049505	WEFcMZxJNmMiUOFrcSTaig	md5	Nv48yAceNSh8z8	Pd5UweAXC3cCH1aOMdr1	None	2023-09-29 14:45:51	DzTjkKse

Transform #

Compare gene sets#

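Get file objects (the unfiltered query below returns every file in the instance, which here is exactly the two files found above):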

query = ln.File.filter()

file1, file2 = query.list()

file1.describe()

File(id='nV0w72HVEfJeK6lgb7BO', suffix='.h5ad', accessor='AnnData', description='Conde22', size=28049505, hash='WEFcMZxJNmMiUOFrcSTaig', hash_type='md5', updated_at=2023-09-29 14:45:51)

Provenance:
  🗃️ storage: Storage(id='975nKuX0', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-09-29 14:44:56, created_by_id='DzTjkKse')
  📔 transform: Transform(id='Nv48yAceNSh8z8', name='scRNA-seq', short_name='scrna', version='0', type='notebook', updated_at=2023-09-29 14:45:51, created_by_id='DzTjkKse')
  👣 run: Run(id='Pd5UweAXC3cCH1aOMdr1', run_at=2023-09-29 14:44:58, transform_id='Nv48yAceNSh8z8', created_by_id='DzTjkKse')
  👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-09-29 14:44:56)
  ⬇️ input_of (core.Run): ['2023-09-29 14:45:55']
Features:
  var: FeatureSet(id='EJQhGDAUMVCAW7a878Is', n=36503, type='number', registry='bionty.Gene', hash='dnRexHCtxtmOU81_EpoJ', updated_at=2023-09-29 14:45:40, modality_id='Tkw6vO00', created_by_id='DzTjkKse')
    'KRT4', 'LHX5-AS1', 'OPN1LW', 'STYX', 'CCDC158', 'None', 'None', 'MKNK1-AS1', 'None', 'None', 'LNPEP', 'LINC02485', 'None', 'None', 'IGHEP2', 'CYFIP1', 'A4GNT', 'SLC14A2', 'None', 'None', ...
  obs: FeatureSet(id='KEEZXO20pmTjLPROaTDE', n=4, registry='core.Feature', hash='NUCABLKrrAle7o2cv7hj', updated_at=2023-09-29 14:45:45, modality_id='jUAc2M1C', created_by_id='DzTjkKse')
    🔗 donor (12, core.ULabel): '621B', '640C', 'D496', 'A52', 'A36', '582C', 'A29', 'D503', 'A37', 'A31', ...
    🔗 cell_type (32, bionty.CellType): 'progenitor cell', 'T follicular helper cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'dendritic cell, human', 'CD8-positive, alpha-beta memory T cell, CD45RO-positive', 'classical monocyte', 'regulatory T cell', 'group 3 innate lymphoid cell', 'CD4-positive helper T cell', 'non-classical monocyte', ...
    🔗 assay (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v2', '10x 3' v3', '10x 5' v1'
    🔗 tissue (17, bionty.Tissue): 'mesenteric lymph node', 'lung', 'sigmoid colon', 'thymus', 'lamina propria', 'bone marrow', 'ileum', 'jejunal epithelium', 'spleen', 'thoracic lymph node', ...
Labels:
  🏷️ species (1, bionty.Species): 'human'
  🏷️ tissues (17, bionty.Tissue): 'mesenteric lymph node', 'lung', 'sigmoid colon', 'thymus', 'lamina propria', 'bone marrow', 'ileum', 'jejunal epithelium', 'spleen', 'thoracic lymph node', ...
  🏷️ cell_types (32, bionty.CellType): 'progenitor cell', 'T follicular helper cell', 'naive thymus-derived CD8-positive, alpha-beta T cell', 'dendritic cell, human', 'CD8-positive, alpha-beta memory T cell, CD45RO-positive', 'classical monocyte', 'regulatory T cell', 'group 3 innate lymphoid cell', 'CD4-positive helper T cell', 'non-classical monocyte', ...
  🏷️ experimental_factors (4, bionty.ExperimentalFactor): 'single-cell RNA sequencing', '10x 5' v2', '10x 3' v3', '10x 5' v1'
  🏷️ ulabels (12, core.ULabel): '621B', '640C', 'D496', 'A52', 'A36', '582C', 'A29', 'D503', 'A37', 'A31', ...

file1.view_flow()

https://d33wubrfki0l68.cloudfront.net/e56d119d4145929bec2920c04c78c96c38c9e500/666e5/_images/b02d47ec30b1e9abf438735cde6c22495a2e98263361fa7bb9338c2dc4bf111d.svg

file2.describe()

File(id='Vf6s6oe8cTQ8oqMCCjeT', suffix='.h5ad', accessor='AnnData', description='10x reference adata', size=660792, hash='a2V0IgOjMRHsCeZH169UOQ', hash_type='md5', updated_at=2023-09-29 14:46:24)

Provenance:
  🗃️ storage: Storage(id='975nKuX0', root='/home/runner/work/lamin-usecases/lamin-usecases/docs/test-scrna', type='local', updated_at=2023-09-29 14:44:56, created_by_id='DzTjkKse')
  📔 transform: Transform(id='ManDYgmftZ8Cz8', name='Append a new batch of data', short_name='scrna1', version='0', type='notebook', updated_at=2023-09-29 14:46:25, created_by_id='DzTjkKse')
  👣 run: Run(id='0eCAXKvC5AGTwRu42M0X', run_at=2023-09-29 14:45:55, transform_id='ManDYgmftZ8Cz8', created_by_id='DzTjkKse')
  👤 created_by: User(id='DzTjkKse', handle='testuser1', email='testuser1@lamin.ai', name='Test User1', updated_at=2023-09-29 14:44:56)
Features:
  var: FeatureSet(id='9ZyNwE85EcgUEwAEUkkJ', n=754, type='number', registry='bionty.Gene', hash='WMDxN7253SdzGwmznV5d', updated_at=2023-09-29 14:46:24, modality_id='Tkw6vO00', created_by_id='DzTjkKse')
    'COMMD5', 'MTPN', 'TNFRSF4', 'ANKRD12', 'IL32', 'MATK', 'EIF3G', 'JAML', 'SERPINF1', 'MARCKSL1', 'COA1', 'IGHA1', 'ATP5MF', 'TXN', 'UQCRC1', 'HNRNPK', 'CRIP1', 'SDHC', 'PSMC5', 'S1PR4', ...
  obs: FeatureSet(id='UCZgzCGfvvQjnHbb8ywh', n=1, registry='core.Feature', hash='a0witEZwk8c1sJvQ0-Vg', updated_at=2023-09-29 14:46:24, modality_id='jUAc2M1C', created_by_id='DzTjkKse')
    🔗 cell_type (9, bionty.CellType): 'gamma-delta T cell', 'B cell, CD19-positive', 'CD4-positive, alpha-beta T cell', 'cytotoxic T cell', 'dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD16-positive, CD56-dim natural killer cell, human', 'monocyte', 'CD24-positive, CD4 single-positive thymocyte'
  external: FeatureSet(id='9szxE5HWrIb4E8kRB5mJ', n=2, registry='core.Feature', hash='Va1p2Yt0XUK6Qju8q27m', updated_at=2023-09-29 14:46:24, modality_id='jUAc2M1C', created_by_id='DzTjkKse')
    🔗 assay (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'
    🔗 species (1, bionty.Species): 'human'
Labels:
  🏷️ species (1, bionty.Species): 'human'
  🏷️ cell_types (9, bionty.CellType): 'gamma-delta T cell', 'B cell, CD19-positive', 'CD4-positive, alpha-beta T cell', 'cytotoxic T cell', 'dendritic cell', 'CD8-positive, CD25-positive, alpha-beta regulatory T cell', 'CD16-positive, CD56-dim natural killer cell, human', 'monocyte', 'CD24-positive, CD4 single-positive thymocyte'
  🏷️ experimental_factors (1, bionty.ExperimentalFactor): 'single-cell RNA sequencing'

file2.view_flow()

https://d33wubrfki0l68.cloudfront.net/1d1c2f24c076cc64ca65919dd8a057fc15edd89a/9f607/_images/6ac07902e3fee97339f9c9ba281dcc06554a79ed5641fccc86c1aed659a7916b.svg

Load files into memory:

file1_adata = file1.load()
file2_adata = file2.load()

The shared genes can be computed directly from the registered `var` feature sets, without using the loaded data:

file1_genes = file1.features["var"]
file2_genes = file2.features["var"]

shared_genes = file1_genes & file2_genes
len(shared_genes)

shared_genes.list("symbol")[:10]

['ARF6',
 'TOMM7',
 'U2AF1',
 'NOSIP',
 'GOPC',
 'SLC3A2',
 'ATP5ME',
 'ACAA1',
 'MFSD14B',
 'CYTL1']

Compare cell types#

file1_celltypes = file1.cell_types.all()
file2_celltypes = file2.cell_types.all()

shared_celltypes = file1_celltypes & file2_celltypes
shared_celltypes_names = shared_celltypes.list("name")
shared_celltypes_names

['gamma-delta T cell', 'CD16-positive, CD56-dim natural killer cell, human']

We can now subset the two datasets by shared cell types:

file1_adata_subset = file1_adata[
    file1_adata.obs["cell_type"].isin(shared_celltypes_names)
]

file2_adata_subset = file2_adata[
    file2_adata.obs["cell_type"].isin(shared_celltypes_names)
]

Concatenate subsetted datasets:

adata_concat = ad.concat(
    [file1_adata_subset, file2_adata_subset],
    label="file",
    keys=[file1.description, file2.description],
)
adata_concat

AnnData object with n_obs × n_vars = 187 × 749
    obs: 'cell_type', 'file'
    obsm: 'X_umap'

adata_concat.obs.value_counts()

cell_type                                           file               
CD16-positive, CD56-dim natural killer cell, human  Conde22                114
gamma-delta T cell                                  Conde22                 66
                                                    10x reference adata      4
CD16-positive, CD56-dim natural killer cell, human  10x reference adata      3
dtype: int64