Junction Data Processing

Utilities for converting SiCeLoRe-style junction matrices into AnnData Zarr format for scalable single-cell analysis.

juncmatrix_to_zarr


def juncmatrix_to_zarr(
    junc_path:Union[str, Path], out_zarr:Union[str, Path], sample:Optional[str]=None,
    gtf:Optional[Union[str, Path]]=None, threads:int=4, chunksize:int=20000, estimate_total:bool=False,
    gene_progress:bool=False, cellmeta_path:Optional[Union[str, Path]]=None, barcode_prefix:Optional[str]=None,
    overwrite:bool=False, show_progress:bool=True
)->Path:

Convert one SiCeLoRe-style juncmatrix(.txt|.gz) to a sparse AnnData Zarr store.

Output AnnData: - X: sparse CSR, shape (cells × junctions) - obs: barcode index (prefixed), includes ‘sample’ + optional cell metadata - var: index is junction id, contains ONLY ‘geneId’ where geneId is: - Ensembl gene_id if GTF mapping succeeds - otherwise the junction prefix (gene symbol) so it is NEVER NaN


batch_juncmatrix_to_zarr


def batch_juncmatrix_to_zarr(
    inputs:Union[str, Path, Sequence[Union[str, Path]]], outroot:Union[str, Path],
    gtf:Optional[Union[str, Path]]=None, threads:int=4, chunksize:int=20000, estimate_total:bool=False,
    gene_progress:bool=False, overwrite:bool=False, show_progress:bool=True
)->list[Path]:

Call self as a function.


merge_junction_zarrs


def merge_junction_zarrs(
    root_or_paths:Union[str, Path, Sequence[Union[str, Path]]], out_zarr:Union[str, Path], join:str='outer',
    sample_key:str='sample', ensure_unique_barcodes:bool=True, skip_hidden:bool=True,
    skip_names:Iterable[str]=('ANALYSIS',), overwrite:bool=False, show_progress:bool=True
)->Path:

Call self as a function.

Usage

An optional GTF file can be provided to map junction gene-name prefixes to Ensembl gene IDs. Download a GTF for your organism if needed:

from pathlib import Path
import urllib.request

data_dir = Path('../data')
data_dir.mkdir(parents=True, exist_ok=True)

gtf_url = 'ftp://ftp.ensembl.org/pub/release-102/gtf/mus_musculus/Mus_musculus.GRCm38.102.gtf.gz'
gtf_local = data_dir / 'Mus_musculus.GRCm38.102.gtf.gz'

if not gtf_local.exists():
    print('Downloading GRCm38 GTF ...')
    urllib.request.urlretrieve(gtf_url, gtf_local)

print('Done:', gtf_local)

Convert a single juncmatrix to Zarr

juncmatrix_to_zarr(
    '/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s190_juncmatrix.txt',
    '/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s190_junc_adta.zarr',
    gtf=gtf_local,
    threads=30,
    chunksize=20_000,
    estimate_total=True,
    gene_progress=True,
)

Batch convert and merge multiple samples

# Merge per-sample Zarr stores into a single cohort Zarr
from pathlib import Path

z1 = Path('/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s190_junc_adta.zarr')
z2 = Path('/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s951_junc_adta.zarr')
out_merged = Path('/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s190_s951_junc_merged.zarr')

merged_path = merge_junction_zarrs(
    [z1, z2],
    out_merged,
    join='outer',
    sample_key='sample',
    overwrite=True,
)
print('Merged written to:', merged_path)
import anndata as ad

A = ad.read_zarr(str(out_merged))
print(A)
print('shape:', A.shape)
print('obs cols:', list(A.obs.columns))
print('var cols:', list(A.var.columns))
if 'sample' in A.obs:
    print(A.obs['sample'].value_counts())