Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HVG genes setting through files #130

Merged
merged 2 commits into from
Feb 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions scanpy-scripts-tests.bats
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,11 @@ setup() {
norm_opt="--save-layer filtered -t 10000 -l all -n after -X ${norm_mtx} --show-obj stdout"
norm_obj="${output_dir}/norm.h5ad"
hvg_opt="-m 0.0125 3 -d 0.5 inf -s --show-obj stdout"
always_hvg="${data_dir}/always_hvg.txt"
never_hvg="${data_dir}/never_hvg.txt"
hvg_opt_always_never="--always-hv-genes-file ${always_hvg} --never-hv-genes-file ${never_hvg}"
hvg_obj="${output_dir}/hvg.h5ad"
hvg_obj_on_off="${output_dir}/hvg_on_off.h5ad"
regress_opt="-k n_counts --show-obj stdout"
regress_obj="${output_dir}/regress.h5ad"
scale_opt="--save-layer normalised -m 10 --show-obj stdout"
Expand Down Expand Up @@ -131,6 +135,22 @@ setup() {
[ -f "$raw_matrix_from_raw" ]
}

@test "Add genes to be considered HVGs" {
if [ "$resume" = 'true' ] && [ -f "$always_hvg" ]; then
skip "$always_hvg exists"
fi

run eval "echo -e 'MIR1302-10\nFAM138A' > $always_hvg"
}

@test "Add genes not to be considered HVGs" {
if [ "$resume" = 'true' ] && [ -f "$never_hvg" ]; then
skip "$never_hvg exists"
fi

run eval "echo -e 'ISG15\nTNFRSF4' > $never_hvg"
}

@test "Test MTX write from layers" {
if [ "$resume" = 'true' ] && [ -f "$raw_matrix_from_layer" ]; then
skip "$raw_matrix exists"
Expand Down Expand Up @@ -219,6 +239,14 @@ setup() {
[ -f "$hvg_obj" ]
}

@test "Find variable genes with optional turn on/off lists" {
if [ "$resume" = 'true' ] && [ -f "$hvg_obj_on_off" ]; then
skip "$hvg_obj_on_off exists and resume is set to 'true'"
fi

run rm -f $hvg_obj_on_off && eval "$scanpy hvg $hvg_opt_always_never $norm_obj $hvg_obj_on_off"
}

# Do separate doublet simulation step (normally we'd just let the main scrublet
# process do this).

Expand Down
19 changes: 17 additions & 2 deletions scanpy_scripts/cmd_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
"""

import click

from .click_utils import (
CommaSeparatedText,
Dictionary,
valid_limit,
valid_parameter_limits,
mutually_exclusive_with,
required_by,
valid_limit,
valid_parameter_limits,
)

COMMON_OPTIONS = {
Expand Down Expand Up @@ -856,6 +857,20 @@
"'seurat_v3', ties are broken by the median (across batches) rank based on "
"within-batch normalized variance.",
),
click.option(
"--always-hv-genes-file",
"always_hv_genes_file",
type=click.Path(exists=True),
default=None,
help="If specified, the gene identifers in this file will be set as highly variable in the var dataframe after HVGs are computed.",
),
click.option(
"--never-hv-genes-file",
"never_hv_genes_file",
type=click.Path(exists=True),
default=None,
help="If specified, the gene identifers in this file will be removed from highly variable in the var dataframe (set to false) after HVGs are computed.",
),
],
"scale": [
*COMMON_OPTIONS["input"],
Expand Down
45 changes: 45 additions & 0 deletions scanpy_scripts/lib/_hvg.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,20 @@ def hvg(
if "n_top_genes" in kwargs and kwargs["n_top_genes"] is not None:
kwargs["n_top_genes"] = min(adata.n_vars, kwargs["n_top_genes"])

always_hv_genes = None
if "always_hv_genes_file" in kwargs and kwargs["always_hv_genes_file"] is not None:
with open(kwargs["always_hv_genes_file"], "r") as f:
always_hv_genes = f.read().splitlines()

never_hv_genes = None
if "never_hv_genes_file" in kwargs and kwargs["never_hv_genes_file"] is not None:
with open(kwargs["never_hv_genes_file"], "r") as f:
never_hv_genes = f.read().splitlines()

# to avoid upsetting the scanpy function with unexpected keyword arguments
del kwargs["always_hv_genes_file"]
del kwargs["never_hv_genes_file"]

sc.pp.highly_variable_genes(
adata,
min_mean=mean_limits[0],
Expand All @@ -30,4 +44,35 @@ def hvg(
**kwargs,
)

return switch_hvgs(adata, always_hv_genes, never_hv_genes)


def switch_hvgs(adata, always_hv_genes=None, never_hv_genes=None):
"""
Function to switch on/off highly variable genes based on a list of genes.

>>> adata = sc.datasets.pbmc3k()
>>> sc.pp.normalize_total(adata)
>>> sc.pp.log1p(adata)
>>> sc.pp.highly_variable_genes(adata)
>>> adata = switch_hvgs(adata, always_hv_genes=['MIR1302-10', 'FAM138A'], never_hv_genes=['ISG15', 'TNFRSF4'])
>>> adata.var.loc['ISG15'].highly_variable
False
>>> adata.var.loc['TNFRSF4'].highly_variable
False
>>> adata.var.loc['MIR1302-10'].highly_variable
True
>>> adata.var.loc['CPSF3L'].highly_variable
True
"""
if always_hv_genes is not None:
adata.var.highly_variable = np.logical_or(
adata.var.highly_variable, adata.var_names.isin(always_hv_genes)
)

if never_hv_genes is not None:
adata.var.highly_variable = np.logical_and(
adata.var.highly_variable, ~adata.var_names.isin(never_hv_genes)
)

return adata
Loading