Skip to content

Commit 6b4d7e6

Browse files
committed
cluster_subclusters
1 parent bbc8850 commit 6b4d7e6

2 files changed

Lines changed: 67 additions & 37 deletions

File tree

src/pySingleCellNet/utils/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626
)
2727

2828
from .cell import (
29+
cluster_subclusters,
2930
filter_adata_by_group_size,
3031
rename_cluster_labels,
3132
assign_optimal_cluster,
@@ -67,6 +68,8 @@
6768
"pull_out_genes_v2",
6869
"remove_genes",
6970
"limit_anndata_to_common_genes",
71+
"score_sex",
72+
"cluster_subclusters",
7073
"filter_adata_by_group_size",
7174
"rename_cluster_labels",
7275
"assign_optimal_cluster",

src/pySingleCellNet/utils/cell.py

Lines changed: 64 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -12,73 +12,100 @@
1212

1313

1414
def cluster_subclusters(
15-
adata,
15+
adata: ad.AnnData,
1616
cluster_column: str = 'leiden',
17+
cluster_name: str = None,
18+
layer: str = 'counts',
1719
n_hvg: int = 2000,
1820
n_pcs: int = 40,
1921
n_neighbors: int = 10,
20-
leiden_resolution: float = 0.25
21-
) -> sc.AnnData:
22+
leiden_resolution: float = 0.25,
23+
subcluster_col_name: str = 'subcluster'
24+
) -> None:
2225
"""
23-
For each original cluster in `adata.obs[cluster_column]`, recompute highly variable genes
24-
(from the 'counts' layer, flavor='seurat_v3'), run PCA, build kNN, and re-cluster with Leiden.
25-
Writes a new .obs column 'subcluster' whose labels are prefixed with the original cluster. Assumes that original counts are stored in layer['counts']
26+
Subcluster a specified cluster (or all clusters) within an AnnData object by recomputing HVGs, PCA,
27+
kNN graph, and Leiden clustering. Updates the AnnData object in-place, adding or updating
28+
the `subcluster_col_name` column in `.obs` with new labels prefixed by the original cluster.
2629
27-
Parameters
28-
----------
29-
adata
30-
Input AnnData with a pre-existing clustering in `adata.obs[cluster_column]`.
31-
cluster_column
32-
`.obs` column name holding the original cluster assignment.
33-
n_hvg
34-
Number of highly variable genes per original cluster.
35-
n_pcs
36-
Number of PCs to compute.
37-
n_neighbors
38-
Number of neighbors for kNN graph.
39-
leiden_resolution
40-
Resolution parameter passed to `sc.tl.leiden`.
30+
Args:
31+
adata: AnnData
32+
The AnnData object containing precomputed clusters in `.obs[cluster_column]`.
33+
cluster_column: str, optional
34+
Name of the `.obs` column holding the original cluster assignments. Default is 'leiden'.
35+
cluster_name: str or None, optional
36+
Specific cluster label to subcluster. If `None`, applies to all clusters. Default is None.
37+
layer: str, optional
38+
Layer name in `adata.layers` to use for HVG detection. Default is 'counts'.
39+
n_hvg: int, optional
40+
Number of highly variable genes to select per cluster. Default is 2000.
41+
n_pcs: int, optional
42+
Number of principal components to compute. Default is 40.
43+
n_neighbors: int, optional
44+
Number of neighbors for the kNN graph. Default is 10.
45+
leiden_resolution: float, optional
46+
Resolution parameter for Leiden clustering. Default is 0.25.
47+
subcluster_col_name: str, optional
48+
Name of the `.obs` column to store subcluster labels. Default is 'subcluster'.
4149
42-
Returns
43-
-------
44-
None.
45-
Populates .obs['subcluster']
50+
Raises:
51+
ValueError: If `cluster_column` not in `adata.obs`.
52+
ValueError: If `layer` not in `adata.layers`.
53+
ValueError: If `cluster_name` is specified but not found in `adata.obs[cluster_column]`.
4654
"""
47-
# keep a copy of the original
55+
# Error checking
56+
if cluster_column not in adata.obs:
57+
raise ValueError(f"Cluster column '{cluster_column}' not found in adata.obs")
58+
if layer not in adata.layers:
59+
raise ValueError(f"Layer '{layer}' not found in adata.layers")
60+
61+
# Convert original clusters to string
4862
adata.obs['original_cluster'] = adata.obs[cluster_column].astype(str)
4963

50-
# prepare the column
51-
adata.obs['subcluster'] = None
64+
# Ensure subcluster column exists
65+
adata.obs[subcluster_col_name] = ""
5266

53-
for orig in adata.obs['original_cluster'].unique():
67+
# Validate cluster_name
68+
unique_clusters = adata.obs['original_cluster'].unique()
69+
if cluster_name is not None:
70+
if str(cluster_name) not in unique_clusters:
71+
raise ValueError(
72+
f"Cluster '{cluster_name}' not found in adata.obs['{cluster_column}']"
73+
)
74+
clusters_to_process = [str(cluster_name)]
75+
else:
76+
clusters_to_process = unique_clusters
77+
78+
# Iterate and subcluster
79+
for orig in clusters_to_process:
5480
mask = adata.obs['original_cluster'] == orig
5581
sub = adata[mask].copy()
5682

57-
# 1) HVG
83+
# 1) Compute HVGs
5884
sc.pp.highly_variable_genes(
5985
sub,
6086
flavor='seurat_v3',
6187
n_top_genes=n_hvg,
62-
layer='counts'
88+
layer=layer
6389
)
90+
6491
# 2) PCA
6592
sc.pp.pca(sub, n_comps=n_pcs, use_highly_variable=True)
93+
6694
# 3) kNN
6795
sc.pp.neighbors(sub, n_neighbors=n_neighbors, use_rep='X_pca')
96+
6897
# 4) Leiden
6998
sc.tl.leiden(
7099
sub,
71100
resolution=leiden_resolution,
72101
flavor='igraph',
73-
n_iterations=2
102+
n_iterations=2,
103+
key_added='leiden_sub'
74104
)
75105

76-
# build labels like "2_0", "2_1", etc.
77-
prefixed = orig + "_" + sub.obs['leiden'].astype(str)
78-
adata.obs.loc[mask, 'subcluster'] = prefixed.values
79-
80-
return adata
81-
106+
# Prefix and assign back
107+
labels = (orig + "_" + sub.obs['leiden_sub'].astype(str)).values
108+
adata.obs.loc[mask, subcluster_col_name] = labels
82109

83110

84111

0 commit comments

Comments
 (0)