Skip to content

Commit

Permalink
Merge pull request #14 from michaelgmz/main
Browse files Browse the repository at this point in the history
0.2.4
  • Loading branch information
michaelgmz authored Sep 27, 2022
2 parents d3fcee0 + 2a8efbf commit b16146e
Show file tree
Hide file tree
Showing 9 changed files with 257 additions and 125 deletions.
5 changes: 5 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Release History
===============

Version 0.2.4
-------------
- Support the input of one or multiple genes trends to initialize cell time, see config.py, parameter self.IROOT
- Re-formulate the structure of configuratioin file

Version 0.2.3
-------------
- Change threshold for self.AGENES_R2
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

setup(
name='unitvelo',
version='0.2.3',
version='0.2.4',
# use_scm_version=True,
# setup_requires=['setuptools_scm'],

Expand Down
158 changes: 93 additions & 65 deletions unitvelo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,60 +3,100 @@
#! Don't use this class directly.
#! Instead, sub-class it and override the configurations you need to change.

class Configuration(object):

class Preprocessing(object):
def __init__(self):
# (int) speficy the GPU card for acceleration, default 0
# -1 will switch to CPU mode
self.GPU = 0
self.MIN_SHARED_COUNTS = 20

# (str) relevant path for saving scv plots
# self.FIG_DIR = './figures/'
# (int) # of highly variable genes selected for pre-processing, default 2000
# consider decreasing to 1500 when # of cells > 10k
self.N_TOP_GENES = 2000

# Gaussian Mixture
self.BASE_FUNCTION = 'Gaussian'
self.N_PCS = 30
self.N_NEIGHBORS = 30

# Deterministic Curve Linear
self.GENERAL = 'Curve'
# (bool) use raw un/spliced counts or first order moments
self.USE_RAW = False

# (str) embedding format of adata, e.g. pca, tsne, umap,
# if None (default), algorithm will choose one automatically
self.BASIS = None
# (bool) rescaled Mu/Ms as input based on variance, default True
self.RESCALE_DATA = True

# (int) # of highly variable genes selected for pre-processing, default 2000
# consider decreasing to 1500 when # of cells > 10k
self.N_TOP_GENES = 2000
class Regularization(object):
def __init__(self):
# (bool) regularization on loss function to push peak time away from 0.5
# mainly used in unified time mode for linear phase portraits
self.REG_LOSS = True
# (float) gloablly adjust the magnitude of the penalty, recommend < 0.1
self.REG_TIMES = 0.075
# (float) scaling parameter of the regularizer
self.REG_SCALE = 1

# (str) selection creteria for velocity genes used in RNA velocity construction, default basic
# raws, all highly variable genes specified by self.N_TOP_GENES will be used
# offset, linear regression $R^2$ and coefficient with offset, will override self.R2_ADJUST
# basic, linear regression $R^2$ and coefficient without offset
# [single gene name], fit this designated gene alone, for model validation purpose only
# [list of gene names], manually provide a list of genes as velocity genes in string, might improve performance, see scNT
self.VGENES = 'basic'
# (list of tuples) [(gene1, trend1), (gene2, trend2), (gene3, trend3), ...],
# a list of genes with trend can be one of {increase, decrease}, default None
self.GENE_PRIOR = None
self.GENE_PRIOR_SCALE = 5000

class Optimizer(object):
def __init__(self):
# (float) learning rate of the main optimizer
self.LEARNING_RATE = 1e-2
# (int) maximum iteration rate of main optimizer
self.MAX_ITER = 12000

class FittingOption(object):
def __init__(self):
# Fitting options under Gaussian model
# '1' = Unified-time mode
# '2' = Independent mode
self.FIT_OPTION = '1'

# (str, experimental) methods to aggregate time metrix, default 'SVD'
# Max SVD Raw
self.DENSITY = 'SVD'
# (str) whether to reorder cell based on relative positions for time assignment
# Soft_Reorder (default) Hard (for Independent mode)
self.REORDER_CELL = 'Soft_Reorder'
# (bool) aggregate gene-specific time to cell time during fitting
# controlled by self.FIT_OPTION
self.AGGREGATE_T = True

# (bool, experimental), whether clip negative predictions to 0, default False
self.ASSIGN_POS_U = False

# (bool, experimental) cell time restricted to (0, 1) if False, default False
self.RESCALE_TIME = False

class VelocityGenes(object):
def __init__(self):
# (bool) linear regression $R^2$ on extreme quantile (default) or full data (adjusted)
# valid when self.VGENES = 'basic'
self.R2_ADJUST = True

# (str) selection creteria for velocity genes used in RNA velocity construction, default basic
# 1. raws, all highly variable genes specified by self.N_TOP_GENES will be used
# 2. offset, linear regression $R^2$ and coefficient with offset, will override self.R2_ADJUST
# 3. basic, linear regression $R^2$ and coefficient without offset
# 4. single gene name, fit this designated gene alone, for model validation purpose only
# 5. [list of gene names], manually provide a list of genes as velocity genes in string, might improve performance, see scNT
self.VGENES = 'basic'

# (float) threshold of R2 at later stage of the optimization proces
# to capture the dynamics of more genes beside initially selected velocity genes
# Note: self.AGENES_R2 = 1 will switch to origianl mode with no amplification stage
# Note: self.AGENES_R2 = 1 will switch to origianl mode with no amplification
self.AGENES_R2 = 1
self.AGENES_THRES = 0.61

# (bool, experimental) exclude cell that have 0 expression in either un/spliced when contributing to loss function
self.FILTER_CELLS = False

# (bool, experimental) cell time restricted to (0, 1) if False, default False
self.RESCALE_TIME = False

# (bool) rescaled Mu/Ms as input based on variance, default True
self.RESCALE_DATA = True

class CellInitialization(object):
def __init__(self):
# (str) criteria for cell latent time initialization, default None
# None, initialized based on the exact order of input expression matrix
# gcount, initialized based on gene counts (https://www.science.org/doi/abs/10.1126/science.aax0249)
# [cluster name], use diffusion map based time as initialization
# 1. None, initialized based on the exact order of input expression matrix
# 2. gcount, str, initialized based on gene counts (https://www.science.org/doi/abs/10.1126/science.aax0249)
# 3. cluster name, str, use diffusion map based time as initialization
# 4. [(gene1, trend1), (gene2, trend2), (gene3, trend3), ...], list of tuples,
# a list of genes with trend can be one of {increase, decrease}
self.IROOT = None

# (int) number of random initializations of time points, default 1
Expand All @@ -68,41 +108,29 @@ def __init__(self):
# re_init, reverse the initialization time of first run
self.NUM_REP_TIME = 're_pre'

# Fitting options under Gaussian model
# '1' = Unified-time mode
# '2' = Independent mode
self.FIT_OPTION = '1'
class Configuration():
def __init__(self):
Preprocessing.__init__(self)
Regularization.__init__(self)
Optimizer.__init__(self)
FittingOption.__init__(self)
VelocityGenes.__init__(self)
CellInitialization.__init__(self)

# (str, experimental) methods to aggregate time metrix, default 'SVD'
# Max SVD Raw
self.DENSITY = 'SVD'
# (str) whether to reorder cell based on relative positions for time assignment
# Soft_Reorder (default) Hard (for Independent mode)
self.REORDER_CELL = 'Soft_Reorder'
# (bool) aggregate gene-specific time to cell time during fitting
# controlled by self.FIT_OPTION
self.AGGREGATE_T = True
# (int) speficy the GPU card for acceleration, default 0
# -1 will switch to CPU mode
self.GPU = 0

# (bool, experimental), whether clip negative predictions to 0, default False
self.ASSIGN_POS_U = False
# Gaussian Mixture
self.BASE_FUNCTION = 'Gaussian'

# (bool) regularization on loss function to push peak time away from 0.5
# mainly used in unified time mode for linear phase portraits
self.REG_LOSS = True
# (float) gloablly adjust the magnitude of the penalty, recommend < 0.1
self.REG_TIMES = 0.075
# (float) scaling parameter of the regularizer
self.REG_SCALE = 1
# Deterministic Curve Linear
self.GENERAL = 'Curve'

# (str) embedding format of adata, e.g. pca, tsne, umap,
# if None (default), algorithm will choose one automatically
self.BASIS = None

# (int, experimental) window size for sliding smoothing of distribution with highest probability
# useful when self.DENSITY == 'Max'
# self.WIN_SIZE = 50

# (float) learning rate of the main optimizer
self.LEARNING_RATE = 1e-2

# (int) maximum iteration rate of main optimizer
self.MAX_ITER = 12000

# (bool) use raw un/spliced counts or first order moments
self.USE_RAW = False
# self.WIN_SIZE = 50
86 changes: 50 additions & 36 deletions unitvelo/main.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from .utils import remove_dir, min_max
from .utils import remove_dir
from .velocity import Velocity
import scvelo as scv
import os
import logging
import numpy as np

def run_model(
adata,
label,
config_file=None,
normalize=True
normalize=True,
):
"""Preparation and pre-processing function of RNA velocity calculation.
Expand All @@ -26,39 +27,60 @@ def run_model(
"""

from .config import Configuration
if config_file == None:
print (
f'Model configuration file not specified.\n'
f'Default settings with unified-time mode will be used.'
)
from .config import Configuration
config = Configuration()

else:
config = config_file

if config.FIT_OPTION == '1':
config.DENSITY = 'SVD'
config.DENSITY = 'SVD' if config.GENE_PRIOR == None else 'Raw'
config.REORDER_CELL = 'Soft_Reorder'
config.AGGREGATE_T = True
config.ASSIGN_POS_U = False
config.REG_LOSS = True

elif config.FIT_OPTION == '2':
config.DENSITY = 'Raw'
config.REORDER_CELL = 'Hard'
config.AGGREGATE_T = False
config.AGENES_R2 = 1

else:
raise ValueError('config.FIT_OPTION is invalid')

config.MAX_ITER = config.MAX_ITER if config.MAX_ITER > 12000 else 12000

print ('-------> Model Configuration Settings <-------')
for ix, item in enumerate(vars(config).items()):
print ("%s: %s" % item, end=f'\t') if ix % 3 != 0 \
else print ('\n', "%s: %s" % item, end=f'\t')
print ('-------> Manully Specified Parameter <-------')
config_ref = Configuration()
dict_input, dict_ref = vars(config), vars(config_ref)

para_used = []
for parameter in dict_ref:
if dict_input[parameter] != dict_ref[parameter]:
print (parameter, dict_input[parameter], sep=f':\t')
para_used.append(parameter)

if len(para_used) == 0:
print ('None')

print ('-------> Model Configuration Settings <------')
default_para = ['N_TOP_GENES',
'LEARNING_RATE',
'FIT_OPTION',
'DENSITY',
'REORDER_CELL',
'AGGREGATE_T',
'R2_ADJUST',
'GENE_PRIOR',
'VGENES',
'IROOT']

for parameter in default_para:
if parameter not in para_used:
print (parameter, dict_ref[parameter], sep=f':\t')
print (f'\n')

scv.settings.presenter_view = True
Expand All @@ -81,13 +103,16 @@ def run_model(
data_path = os.path.join(cwd, 'res', 'temp.h5ad')

if normalize:
scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=config.N_TOP_GENES)
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.pp.filter_and_normalize(adata,
min_shared_counts=config.MIN_SHARED_COUNTS,
n_top_genes=config.N_TOP_GENES)
scv.pp.moments(adata,
n_pcs=config.N_PCS,
n_neighbors=config.N_NEIGHBORS)
else:
scv.pp.neighbors(adata)

remove_dir(data_path, adata)
import logging
logging.basicConfig(filename=os.path.join(adata.uns['temp'], 'logging.txt'),
filemode='a',
format='%(asctime)s, %(levelname)s, %(message)s',
Expand All @@ -100,34 +125,23 @@ def run_model(
adata = adata_second.copy()
adata.obs['latent_time_gm'] = pre_time_gm

if config.IROOT == 'gcount':
adata.obs['gcount'] = np.sum(adata.X.todense() > 0, axis=1)
init_time = 1 - min_max(adata.obs.groupby(label)['gcount'].mean())

for id in list(init_time.index):
adata.obs.loc[adata.obs[label] == id, 'gcount'] = init_time[id]

adata.uns['datapath'] = data_path
adata.uns['label'] = label
adata.uns['base_function'] = config.BASE_FUNCTION

if 'true_alpha' not in adata.var.columns:
if config.BASIS is None:
basis_keys = ["pca", "tsne", "umap"]
basis = [key for key in basis_keys if f"X_{key}" in adata.obsm.keys()][-1]
elif f"X_{config.BASIS}" in adata.obsm.keys():
basis = config.BASIS
else:
raise ValueError('Invalid embedding parameter self.BASIS')
adata.uns['basis'] = basis

if config.BASIS is None:
basis_keys = ["pca", "tsne", "umap"]
basis = [key for key in basis_keys if f"X_{key}" in adata.obsm.keys()][-1]
elif f"X_{config.BASIS}" in adata.obsm.keys():
basis = config.BASIS
else:
basis = None
raise ValueError('Invalid embedding parameter config.BASIS')
adata.uns['basis'] = basis

# if 'scNT' in data_path:
# import pandas as pd
# gene_ids = pd.read_csv('../data/scNT/brie_neuron_splicing_time.tsv', delimiter='\t', index_col='GeneID')
# config.VGENES = list(gene_ids.loc[gene_ids['time_FDR'] < 0.01].index)
if 'scNT' in data_path:
import pandas as pd
gene_ids = pd.read_csv('../data/scNT/brie_neuron_splicing_time.tsv', delimiter='\t', index_col='GeneID')
config.VGENES = list(gene_ids.loc[gene_ids['time_FDR'] < 0.01].index)

model = Velocity(adata, config=config)
model.get_velo_genes()
Expand Down
Loading

0 comments on commit b16146e

Please sign in to comment.