Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added outputs/SRR11940660/pca_pca_leiden.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added outputs/SRR11940660/pca_pca_scatter.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
31 changes: 31 additions & 0 deletions outputs/SRR11940660/top_genes_PC1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
gene,loading,abs_loading
gene-U712_00580,0.12533931,0.12533931
gene-U712_00585,0.10191295,0.10191295
gene-U712_00700,0.09746355,0.09746355
gene-U712_00735,0.09316955,0.09316955
gene-U712_00625,0.091827266,0.091827266
gene-U712_00600,0.09159618,0.09159618
gene-U712_10210,0.091315836,0.091315836
gene-U712_00695,0.09052001,0.09052001
gene-U712_16965,0.089866,0.089866
gene-U712_07665,0.086747095,0.086747095
gene-U712_00705,0.08656324,0.08656324
gene-U712_07310,0.08565827,0.08565827
gene-U712_07675,0.08170622,0.08170622
gene-U712_00570,0.08053239,0.08053239
gene-U712_19840,0.0794902,0.0794902
gene-U712_07300,0.07897619,0.07897619
gene-U712_00560,0.0770236,0.0770236
gene-U712_00615,0.07680274,0.07680274
gene-U712_00655,0.07668921,0.07668921
gene-U712_20290,0.076166674,0.076166674
gene-U712_20750,0.07560649,0.07560649
gene-U712_00630,0.07493979,0.07493979
gene-U712_00610,0.07472653,0.07472653
gene-U712_13780,0.07378898,0.07378898
gene-U712_13925,0.07184191,0.07184191
gene-U712_20755,0.071589574,0.071589574
gene-U712_00665,0.07119978,0.07119978
gene-U712_16980,0.07061581,0.07061581
gene-U712_19250,0.07054084,0.07054084
gene-U712_04730,0.07048743,0.07048743
31 changes: 31 additions & 0 deletions outputs/SRR11940660/top_genes_PC2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
gene,loading,abs_loading
gene-U712_14360,0.19461678,0.19461678
gene-U712_08455,0.16187526,0.16187526
gene-U712_19480,0.1406494,0.1406494
gene-U712_14420,0.13294719,0.13294719
gene-U712_08460,0.13027942,0.13027942
gene-U712_05870,0.12474084,0.12474084
gene-U712_04095,0.12371954,0.12371954
gene-U712_04090,0.12287305,0.12287305
gene-U712_19490,0.12224303,0.12224303
gene-U712_10210,0.122157894,0.122157894
gene-U712_04085,0.121980615,0.121980615
gene-U712_14415,0.11803285,0.11803285
gene-U712_09470,0.11435959,0.11435959
gene-U712_19485,0.11332339,0.11332339
gene-U712_01805,0.11167637,0.11167637
gene-U712_04100,0.11119464,0.11119464
gene-U712_20290,0.108160555,0.108160555
gene-U712_15180,0.10660463,0.10660463
gene-U712_15910,0.10639286,0.10639286
gene-U712_01810,0.10413831,0.10413831
gene-U712_11735,0.10216933,0.10216933
gene-U712_14425,0.101383075,0.101383075
gene-U712_10180,0.09884069,0.09884069
gene-U712_15925,0.09504657,0.09504657
gene-U712_03020,0.09022796,0.09022796
gene-U712_14355,0.08683693,0.08683693
gene-U712_17700,0.08638535,0.08638535
gene-U712_11720,0.086365364,0.086365364
gene-U712_11730,0.08547053,0.08547053
gene-U712_11725,0.08386802,0.08386802
Binary file added outputs/SRR11940660/umap_umap_leiden.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added outputs/SRR11940660/umap_umap_qc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added outputs/SRR11940661/pca_pca_leiden.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added outputs/SRR11940661/pca_pca_scatter.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
31 changes: 31 additions & 0 deletions outputs/SRR11940661/top_genes_PC1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
gene,loading,abs_loading
gene-U712_00580,0.11417147,0.11417147
gene-U712_10210,0.107354835,0.107354835
gene-U712_00585,0.1014146,0.1014146
gene-U712_00700,0.08827221,0.08827221
gene-U712_00735,0.08574556,0.08574556
gene-U712_20290,0.08477078,0.08477078
gene-U712_16965,0.08411337,0.08411337
gene-U712_00695,0.08332969,0.08332969
gene-U712_19840,0.08085494,0.08085494
gene-U712_00625,0.07938439,0.07938439
gene-U712_00705,0.07869947,0.07869947
gene-U712_07665,0.07662746,0.07662746
gene-U712_06215,0.074582025,0.074582025
gene-U712_07310,0.07336663,0.07336663
gene-U712_07675,0.07330899,0.07330899
gene-U712_00560,0.07280758,0.07280758
gene-U712_06220,0.071887605,0.071887605
gene-U712_00600,0.07138487,0.07138487
gene-U712_18050,0.07114354,0.07114354
gene-U712_01810,0.0710372,0.0710372
gene-U712_18700,0.07083048,0.07083048
gene-U712_19250,0.06929057,0.06929057
gene-U712_01805,0.06784177,0.06784177
gene-U712_03020,0.067316405,0.067316405
gene-U712_20285,0.06691975,0.06691975
gene-U712_08455,0.0666639,0.0666639
gene-U712_07680,0.06645966,0.06645966
gene-U712_10205,0.065749034,0.065749034
gene-U712_16300,0.065217495,0.065217495
gene-U712_09415,0.064948425,0.064948425
31 changes: 31 additions & 0 deletions outputs/SRR11940661/top_genes_PC2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
gene,loading,abs_loading
gene-U712_19480,0.1653905,0.1653905
gene-U712_19490,0.15668231,0.15668231
gene-U712_14360,0.14526144,0.14526144
gene-U712_19485,0.14301682,0.14301682
gene-U712_08455,0.1222755,0.1222755
gene-U712_05870,0.12153367,0.12153367
gene-U712_09470,0.11716664,0.11716664
gene-U712_14420,0.114116274,0.114116274
gene-U712_19495,0.11070127,0.11070127
gene-U712_11735,0.105555564,0.105555564
gene-U712_08460,0.103325754,0.103325754
gene-U712_14415,0.10313875,0.10313875
gene-U712_11725,0.10097674,0.10097674
gene-U712_06215,-0.09902469,0.09902469
gene-U712_06220,-0.09858269,0.09858269
gene-U712_10210,0.09832671,0.09832671
gene-U712_14425,0.09571376,0.09571376
gene-U712_11720,0.095571786,0.095571786
gene-U712_11730,0.093575805,0.093575805
gene-U712_11740,0.0906311,0.0906311
gene-U712_11715,0.08936489,0.08936489
gene-U712_02265,0.08880372,0.08880372
gene-U712_00600,-0.08737269,0.08737269
gene-U712_15910,0.085302696,0.085302696
gene-U712_01805,0.084308736,0.084308736
gene-U712_20290,0.08309676,0.08309676
gene-U712_19860,-0.08125018,0.08125018
gene-U712_08905,0.08097693,0.08097693
gene-U712_11710,0.08042957,0.08042957
gene-U712_19855,-0.08037939,0.08037939
Binary file added outputs/SRR11940661/umap_umap_leiden.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added outputs/SRR11940661/umap_umap_qc.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
234 changes: 234 additions & 0 deletions scripts/scrna_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
#!/usr/bin/env python3
import os
from os.path import join
import argparse

import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib
matplotlib.use("Agg") # ensure headless plotting
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import median_abs_deviation
from BCBio import GFF
import scanpy as sc


# ---------------------------
# Helpers
# ---------------------------
def get_protein_id(feature):
try:
protein_id = feature.sub_features[0].qualifiers.get("protein_id")
return protein_id[0] if protein_id else ""
except Exception:
return ""

def get_product_name(feature):
try:
product = feature.sub_features[0].qualifiers.get("product")
return product[0] if product else ""
except Exception:
return ""

def parse_gff_build_maps(gff_file):
protein_ids, feature_ids, feature_types, product_names = [], [], [], []
with open(gff_file) as in_handle:
for rec in GFF.parse(in_handle):
for feature in rec.features:
if feature.type == "gene":
protein_ids.append(get_protein_id(feature))
product_names.append(get_product_name(feature))
feature_ids.append(feature.qualifiers["ID"][0])
feature_types.append(feature.qualifiers.get("gene_biotype", [""])[0])
feature_types_map = dict(zip(feature_ids, feature_types))
protein_ids_map = dict(zip(feature_ids, protein_ids))
product_names_map = dict(zip(feature_ids, product_names))
return feature_types_map, protein_ids_map, product_names_map

def calculate_qc_metrics(adata):
sc.pp.calculate_qc_metrics(
adata, qc_vars=["rRNA", "tRNA"], inplace=True, percent_top=[20], log1p=True
)

def is_outlier(adata, metric: str, nmads: int):
M = adata.obs[metric]
outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
np.median(M) + nmads * median_abs_deviation(M) < M
)
return outlier

def top_genes_for_pc(loadings_col, genes, n=30):
abs_vals = np.abs(loadings_col)
idx = np.argsort(abs_vals)[::-1][:n]
return pd.DataFrame({
"gene": genes[idx],
"loading": loadings_col[idx],
"abs_loading": abs_vals[idx],
})


# ---------------------------
# Main pipeline
# ---------------------------
def main(args):
os.makedirs(args.outdir, exist_ok=True)
sc.settings.figdir = args.outdir # where scanpy will save figures
sc.settings.autoshow = False

# --- GFF annotations ---
feature_types_map, protein_ids_map, product_names_map = parse_gff_build_maps(args.gff)

# --- Read data ---
adata = sc.read_10x_mtx(args.input, var_names="gene_ids", cache=True)
print(adata)

# --- Annotate genes ---
adata.var["gene_type"] = adata.var_names.map(feature_types_map)
adata = adata[:, ~adata.var["gene_type"].isna()].copy()
adata.var["protein_id"] = adata.var_names.map(protein_ids_map)
adata.var["product_names"] = adata.var_names.map(product_names_map)

# Mark rRNA and tRNA genes
adata.var["rRNA"] = adata.var["gene_type"].str.contains("rRNA", case=False, na=False)
adata.var["tRNA"] = adata.var["gene_type"].str.contains("tRNA", case=False, na=False)

# --- QC (initial) ---
calculate_qc_metrics(adata)
print(f"Total number of cells (raw): {adata.n_obs}")

# --- Filter low counts ---
adata_filter = adata.copy()
sc.pp.filter_cells(adata_filter, min_counts=args.min_counts)
print(f"Total number of cells (min_counts≥{args.min_counts}): {adata_filter.n_obs}")

# --- Identify outliers ---
# These columns exist after calculate_qc_metrics(log1p=True, percent_top=[20])
adata_filter.obs["outlier"] = (
is_outlier(adata_filter, "log1p_total_counts", 5)
| is_outlier(adata_filter, "log1p_n_genes_by_counts", 5)
| is_outlier(adata_filter, "pct_counts_in_top_20_genes", 5)
| is_outlier(adata_filter, "pct_counts_rRNA", 5)
)
print(adata_filter.obs["outlier"].value_counts())

# --- Remove outliers and tRNA-high cells (if available) ---
adata_filter = adata_filter[~adata_filter.obs["outlier"]].copy()
if "pct_counts_tRNA" in adata_filter.obs.columns:
adata_filter = adata_filter[adata_filter.obs["pct_counts_tRNA"] < 10].copy()

# --- Recalculate QC metrics after filtering ---
calculate_qc_metrics(adata_filter)

# --- Remove rRNA and tRNA genes from matrix (keeps obs QC cols intact) ---
adata_filter = adata_filter[:, ~(adata_filter.var["rRNA"] | adata_filter.var["tRNA"])].copy()
print(f"Final shape after filtering: {adata_filter.shape}")

# ======================================================
# Layers: counts, norm, log1p
# ======================================================
adata_filter.layers["counts"] = adata_filter.X.copy() # raw counts layer

norm_res = sc.pp.normalize_total(adata_filter, target_sum=args.target_sum, inplace=False)
adata_filter.layers["norm"] = norm_res["X"].copy()

adata_filter.layers["log1p"] = np.log1p(adata_filter.layers["norm"])
adata_filter.X = adata_filter.layers["log1p"].copy() # use for downstream

# ======================================================
# PCA & plots
# ======================================================
# Scale (sparse-safe)
if sp.issparse(adata_filter.X):
sc.pp.scale(adata_filter, max_value=10, zero_center=False)
else:
sc.pp.scale(adata_filter, max_value=10)

# PCA (match zero_center path)
if sp.issparse(adata_filter.X):
sc.tl.pca(adata_filter, svd_solver="arpack", zero_center=False)
else:
sc.tl.pca(adata_filter, svd_solver="arpack")

# Save PCA variance ratio and PCA scatter
sc.pl.pca_variance_ratio(adata_filter, log=True, show=False, save="_pca_variance.png")
sc.pl.pca(
adata_filter,
color=["total_counts", "n_genes_by_counts", "pct_counts_rRNA", "pct_counts_tRNA"],
show=False, save="_pca_scatter.png"
)

# Identify genes that impact PC1/PC2 the most & save
loadings = adata_filter.varm["PCs"] # (n_genes, n_pcs)
genes = adata_filter.var_names
top_pc1 = top_genes_for_pc(loadings[:, 0], genes, n=args.top_loading_genes)
top_pc2 = top_genes_for_pc(loadings[:, 1], genes, n=args.top_loading_genes)
top_pc1.to_csv(join(args.outdir, "top_genes_PC1.csv"), index=False)
top_pc2.to_csv(join(args.outdir, "top_genes_PC2.csv"), index=False)

# Bar plots for loadings
sc.pl.pca_loadings(adata_filter, components=[1, 2], n_points=args.top_loading_genes,
show=False, save="_pca_loadings_1_2.png")

# ======================================================
# Neighbors / UMAP / Leiden
# ======================================================
sc.pp.neighbors(adata_filter, n_neighbors=args.n_neighbors, n_pcs=args.n_pcs)
sc.tl.umap(adata_filter)
sc.pl.umap(
adata_filter,
color=["total_counts", "n_genes_by_counts", "pct_counts_rRNA", "pct_counts_tRNA"],
show=False, save="_umap_qc.png"
)

sc.tl.leiden(adata_filter, resolution=args.resolution)
sc.pl.pca(adata_filter, color=["leiden"], show=False, save="_pca_leiden.png")
sc.pl.umap(adata_filter, color=["leiden"], legend_loc="on data", show=False, save="_umap_leiden.png")

# ======================================================
# Differential expression & heatmaps
# ======================================================
sc.tl.rank_genes_groups(adata_filter, groupby="leiden", method="wilcoxon")
sc.pl.rank_genes_groups(adata_filter, n_genes=10, sharey=False, show=False, save="_rank_genes.png")

# Dendrogram must match current leiden categories (recompute defensively)
adata_filter.obs["leiden"] = adata_filter.obs["leiden"].astype("category")
adata_filter.obs["leiden"] = adata_filter.obs["leiden"].cat.remove_unused_categories()
adata_filter.uns.pop("dendrogram_leiden", None)
sc.tl.dendrogram(adata_filter, groupby="leiden")

sc.pl.rank_genes_groups_heatmap(
adata_filter, n_genes=5, groupby="leiden", show=False, show_gene_labels=True,
save="_rank_genes_heatmap.png"
)

# Also save the AnnData object
adata_out = join(args.outdir, "adata_filtered.h5ad")
adata_filter.write(adata_out)
print(f"\n✓ Done. Results saved in: {args.outdir}")
print(f" - AnnData: {adata_out}")
print(" - Figures: saved by Scanpy into that directory (filenames start with the suffixes given to `save=`).")
print(" - Tables: top_genes_PC1.csv, top_genes_PC2.csv")


# ---------------------------
# CLI
# ---------------------------
if __name__ == "__main__":
p = argparse.ArgumentParser(
description="Scanpy pipeline with GFF annotation, layers (counts/norm/log1p), PCA/UMAP/Leiden, and saved plots."
)
p.add_argument("--input", required=True,
help="Path to 10X mtx directory (folder containing matrix.mtx, barcodes.tsv, features.tsv).")
p.add_argument("--gff", required=True, help="GFF annotation file.")
p.add_argument("--outdir", required=True, help="Output directory for plots, tables, and h5ad.")
p.add_argument("--min-counts", type=int, default=20, help="Minimum counts per cell for filtering.")
p.add_argument("--target-sum", type=float, default=1e4, help="Library size target for normalize_total.")
p.add_argument("--n-neighbors", type=int, default=10, help="Neighbors for graph construction.")
p.add_argument("--n-pcs", type=int, default=30, help="Number of PCs for neighbors/UMAP.")
p.add_argument("--resolution", type=float, default=1.0, help="Leiden resolution.")
p.add_argument("--top-loading-genes", type=int, default=30, help="Top genes to report for PC1/PC2 and loading bars.")
args = p.parse_args()
main(args)