from pathlib import Path
import urllib.request
data_dir = Path('../data')
data_dir.mkdir(parents=True, exist_ok=True)
gtf_url = 'ftp://ftp.ensembl.org/pub/release-102/gtf/mus_musculus/Mus_musculus.GRCm38.102.gtf.gz'
gtf_local = data_dir / 'Mus_musculus.GRCm38.102.gtf.gz'
if not gtf_local.exists():
print('Downloading GRCm38 GTF ...')
urllib.request.urlretrieve(gtf_url, gtf_local)
print('Done:', gtf_local)Junction Data Processing
Utilities for converting SiCeLoRe-style junction matrices into AnnData Zarr format for scalable single-cell analysis.
juncmatrix_to_zarr
def juncmatrix_to_zarr(
junc_path:Union[str, Path], out_zarr:Union[str, Path], sample:Optional[str]=None,
gtf:Optional[Union[str, Path]]=None, threads:int=4, chunksize:int=20000, estimate_total:bool=False,
gene_progress:bool=False, cellmeta_path:Optional[Union[str, Path]]=None, barcode_prefix:Optional[str]=None,
overwrite:bool=False, show_progress:bool=True
)->Path:
Convert one SiCeLoRe-style juncmatrix(.txt|.gz) to a sparse AnnData Zarr store.
Output AnnData: - X: sparse CSR, shape (cells × junctions) - obs: barcode index (prefixed), includes ‘sample’ + optional cell metadata - var: index is junction id, contains ONLY ‘geneId’ where geneId is: - Ensembl gene_id if GTF mapping succeeds - otherwise the junction prefix (gene symbol) so it is NEVER NaN
batch_juncmatrix_to_zarr
def batch_juncmatrix_to_zarr(
inputs:Union[str, Path, Sequence[Union[str, Path]]], outroot:Union[str, Path],
gtf:Optional[Union[str, Path]]=None, threads:int=4, chunksize:int=20000, estimate_total:bool=False,
gene_progress:bool=False, overwrite:bool=False, show_progress:bool=True
)->list[Path]:
Call self as a function.
merge_junction_zarrs
def merge_junction_zarrs(
root_or_paths:Union[str, Path, Sequence[Union[str, Path]]], out_zarr:Union[str, Path], join:str='outer',
sample_key:str='sample', ensure_unique_barcodes:bool=True, skip_hidden:bool=True,
skip_names:Iterable[str]=('ANALYSIS',), overwrite:bool=False, show_progress:bool=True
)->Path:
Call self as a function.
Usage
An optional GTF file can be provided to map junction gene-name prefixes to Ensembl gene IDs. Download a GTF for your organism if needed:
Convert a single juncmatrix to Zarr
juncmatrix_to_zarr(
'/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s190_juncmatrix.txt',
'/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s190_junc_adta.zarr',
gtf=gtf_local,
threads=30,
chunksize=20_000,
estimate_total=True,
gene_progress=True,
)Batch convert and merge multiple samples
# Merge per-sample Zarr stores into a single cohort Zarr
from pathlib import Path
z1 = Path('/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s190_junc_adta.zarr')
z2 = Path('/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s951_junc_adta.zarr')
out_merged = Path('/data/analysis/data_mcandrew/00_allos_0.6_TAB/data/s190_s951_junc_merged.zarr')
merged_path = merge_junction_zarrs(
[z1, z2],
out_merged,
join='outer',
sample_key='sample',
overwrite=True,
)
print('Merged written to:', merged_path)import anndata as ad
A = ad.read_zarr(str(out_merged))
print(A)
print('shape:', A.shape)
print('obs cols:', list(A.obs.columns))
print('var cols:', list(A.var.columns))
if 'sample' in A.obs:
print(A.obs['sample'].value_counts())