Merge pull request #14 from michaelgmz/main

0.2.4
StatBiomed · Sep 27, 2022 · b16146e · b16146e
2 parents d3fcee0 + 2a8efbf
commit b16146e
Show file tree

Hide file tree

Showing 9 changed files with 257 additions and 125 deletions.
diff --git a/docs/release.rst b/docs/release.rst
@@ -1,6 +1,11 @@
 Release History
 ===============
 
+Version 0.2.4
+-------------
+- Support the input of one or multiple genes trends to initialize cell time, see config.py, parameter self.IROOT
+- Re-formulate the structure of configuratioin file 
+
 Version 0.2.3
 -------------
 - Change threshold for self.AGENES_R2

diff --git a/setup.py b/setup.py
@@ -30,7 +30,7 @@
 
 setup(
     name='unitvelo',
-    version='0.2.3',
+    version='0.2.4',
     # use_scm_version=True,
     # setup_requires=['setuptools_scm'],
 

diff --git a/unitvelo/config.py b/unitvelo/config.py
@@ -3,60 +3,100 @@
 #! Don't use this class directly. 
 #! Instead, sub-class it and override the configurations you need to change.
 
-class Configuration(object):
+
+class Preprocessing(object):
     def __init__(self):
-        # (int) speficy the GPU card for acceleration, default 0
-        # -1 will switch to CPU mode
-        self.GPU = 0
+        self.MIN_SHARED_COUNTS = 20
 
-        # (str) relevant path for saving scv plots
-        # self.FIG_DIR = './figures/'
+        # (int) # of highly variable genes selected for pre-processing, default 2000
+        # consider decreasing to 1500 when # of cells > 10k
+        self.N_TOP_GENES = 2000
 
-        # Gaussian Mixture
-        self.BASE_FUNCTION = 'Gaussian'
+        self.N_PCS = 30
+        self.N_NEIGHBORS = 30
 
-        # Deterministic Curve Linear
-        self.GENERAL = 'Curve'
+        # (bool) use raw un/spliced counts or first order moments
+        self.USE_RAW = False
 
-        # (str) embedding format of adata, e.g. pca, tsne, umap, 
-        # if None (default), algorithm will choose one automatically
-        self.BASIS = None
+        # (bool) rescaled Mu/Ms as input based on variance, default True 
+        self.RESCALE_DATA = True
 
-        # (int) # of highly variable genes selected for pre-processing, default 2000
-        # consider decreasing to 1500 when # of cells > 10k
-        self.N_TOP_GENES = 2000
+class Regularization(object):
+    def __init__(self):
+        # (bool) regularization on loss function to push peak time away from 0.5
+        # mainly used in unified time mode for linear phase portraits
+        self.REG_LOSS = True
+        # (float) gloablly adjust the magnitude of the penalty, recommend < 0.1
+        self.REG_TIMES = 0.075
+        # (float) scaling parameter of the regularizer
+        self.REG_SCALE = 1
 
-        # (str) selection creteria for velocity genes used in RNA velocity construction, default basic
-        # raws, all highly variable genes specified by self.N_TOP_GENES will be used
-        # offset, linear regression $R^2$ and coefficient with offset, will override self.R2_ADJUST
-        # basic, linear regression $R^2$ and coefficient without offset
-        # [single gene name], fit this designated gene alone, for model validation purpose only
-        # [list of gene names], manually provide a list of genes as velocity genes in string, might improve performance, see scNT
-        self.VGENES = 'basic'
+        # (list of tuples) [(gene1, trend1), (gene2, trend2), (gene3, trend3), ...], 
+        # a list of genes with trend can be one of {increase, decrease}, default None
+        self.GENE_PRIOR = None
+        self.GENE_PRIOR_SCALE = 5000
+
+class Optimizer(object):
+    def __init__(self):
+        # (float) learning rate of the main optimizer
+        self.LEARNING_RATE = 1e-2
+        # (int) maximum iteration rate of main optimizer
+        self.MAX_ITER = 12000
+
+class FittingOption(object):
+    def __init__(self):
+        # Fitting options under Gaussian model 
+        # '1' = Unified-time mode 
+        # '2' = Independent mode
+        self.FIT_OPTION = '1'
 
+        # (str, experimental) methods to aggregate time metrix, default 'SVD'
+        # Max SVD Raw
+        self.DENSITY = 'SVD'
+        # (str) whether to reorder cell based on relative positions for time assignment
+        # Soft_Reorder (default) Hard (for Independent mode)
+        self.REORDER_CELL = 'Soft_Reorder'
+        # (bool) aggregate gene-specific time to cell time during fitting
+        # controlled by self.FIT_OPTION
+        self.AGGREGATE_T = True
+
+        # (bool, experimental), whether clip negative predictions to 0, default False
+        self.ASSIGN_POS_U = False
+
+        # (bool, experimental) cell time restricted to (0, 1) if False, default False
+        self.RESCALE_TIME = False
+
+class VelocityGenes(object):
+    def __init__(self):
         # (bool) linear regression $R^2$ on extreme quantile (default) or full data (adjusted)
         # valid when self.VGENES = 'basic'
         self.R2_ADJUST = True
 
+        # (str) selection creteria for velocity genes used in RNA velocity construction, default basic
+        # 1. raws, all highly variable genes specified by self.N_TOP_GENES will be used
+        # 2. offset, linear regression $R^2$ and coefficient with offset, will override self.R2_ADJUST
+        # 3. basic, linear regression $R^2$ and coefficient without offset
+        # 4. single gene name, fit this designated gene alone, for model validation purpose only
+        # 5. [list of gene names], manually provide a list of genes as velocity genes in string, might improve performance, see scNT
+        self.VGENES = 'basic'
+
         # (float) threshold of R2 at later stage of the optimization proces
         # to capture the dynamics of more genes beside initially selected velocity genes
-        # Note: self.AGENES_R2 = 1 will switch to origianl mode with no amplification stage
+        # Note: self.AGENES_R2 = 1 will switch to origianl mode with no amplification
         self.AGENES_R2 = 1
         self.AGENES_THRES = 0.61
 
         # (bool, experimental) exclude cell that have 0 expression in either un/spliced when contributing to loss function
         self.FILTER_CELLS = False
 
-        # (bool, experimental) cell time restricted to (0, 1) if False, default False
-        self.RESCALE_TIME = False
-
-        # (bool) rescaled Mu/Ms as input based on variance, default True 
-        self.RESCALE_DATA = True
-
+class CellInitialization(object):
+    def __init__(self):
         # (str) criteria for cell latent time initialization, default None
-        # None, initialized based on the exact order of input expression matrix
-        # gcount, initialized based on gene counts (https://www.science.org/doi/abs/10.1126/science.aax0249)
-        # [cluster name], use diffusion map based time as initialization
+        # 1. None, initialized based on the exact order of input expression matrix
+        # 2. gcount, str, initialized based on gene counts (https://www.science.org/doi/abs/10.1126/science.aax0249)
+        # 3. cluster name, str, use diffusion map based time as initialization
+        # 4. [(gene1, trend1), (gene2, trend2), (gene3, trend3), ...], list of tuples, 
+        #    a list of genes with trend can be one of {increase, decrease} 
         self.IROOT = None
 
         # (int) number of random initializations of time points, default 1
@@ -68,41 +108,29 @@ def __init__(self):
         # re_init, reverse the initialization time of first run
         self.NUM_REP_TIME = 're_pre'
 
-        # Fitting options under Gaussian model 
-        # '1' = Unified-time mode 
-        # '2' = Independent mode
-        self.FIT_OPTION = '1'
+class Configuration():
+    def __init__(self):
+        Preprocessing.__init__(self)
+        Regularization.__init__(self)
+        Optimizer.__init__(self)
+        FittingOption.__init__(self)
+        VelocityGenes.__init__(self)
+        CellInitialization.__init__(self)
 
-        # (str, experimental) methods to aggregate time metrix, default 'SVD'
-        # Max SVD Raw
-        self.DENSITY = 'SVD'
-        # (str) whether to reorder cell based on relative positions for time assignment
-        # Soft_Reorder (default) Hard (for Independent mode)
-        self.REORDER_CELL = 'Soft_Reorder'
-        # (bool) aggregate gene-specific time to cell time during fitting
-        # controlled by self.FIT_OPTION
-        self.AGGREGATE_T = True
+        # (int) speficy the GPU card for acceleration, default 0
+        # -1 will switch to CPU mode
+        self.GPU = 0
 
-        # (bool, experimental), whether clip negative predictions to 0, default False
-        self.ASSIGN_POS_U = False
+        # Gaussian Mixture
+        self.BASE_FUNCTION = 'Gaussian'
 
-        # (bool) regularization on loss function to push peak time away from 0.5
-        # mainly used in unified time mode for linear phase portraits
-        self.REG_LOSS = True
-        # (float) gloablly adjust the magnitude of the penalty, recommend < 0.1
-        self.REG_TIMES = 0.075
-        # (float) scaling parameter of the regularizer
-        self.REG_SCALE = 1
+        # Deterministic Curve Linear
+        self.GENERAL = 'Curve'
+
+        # (str) embedding format of adata, e.g. pca, tsne, umap, 
+        # if None (default), algorithm will choose one automatically
+        self.BASIS = None
 
         # (int, experimental) window size for sliding smoothing of distribution with highest probability
         # useful when self.DENSITY == 'Max'
-        # self.WIN_SIZE = 50
-
-        # (float) learning rate of the main optimizer
-        self.LEARNING_RATE = 1e-2
-
-        # (int) maximum iteration rate of main optimizer
-        self.MAX_ITER = 12000
-
-        # (bool) use raw un/spliced counts or first order moments
-        self.USE_RAW = False
+        # self.WIN_SIZE = 50
diff --git a/unitvelo/main.py b/unitvelo/main.py
@@ -1,14 +1,15 @@
-from .utils import remove_dir, min_max
+from .utils import remove_dir
 from .velocity import Velocity
 import scvelo as scv
 import os
+import logging
 import numpy as np
 
 def run_model(
     adata,
     label,
     config_file=None,
-    normalize=True
+    normalize=True,
 ):
     """Preparation and pre-processing function of RNA velocity calculation.
     
@@ -26,39 +27,60 @@ def run_model(
     
     """
 
+    from .config import Configuration
     if config_file == None:
         print (
             f'Model configuration file not specified.\n'
             f'Default settings with unified-time mode will be used.'
         )
-        from .config import Configuration
         config = Configuration()
 
     else:
         config = config_file
 
     if config.FIT_OPTION == '1':
-        config.DENSITY = 'SVD'
+        config.DENSITY = 'SVD' if config.GENE_PRIOR == None else 'Raw'
         config.REORDER_CELL = 'Soft_Reorder'
         config.AGGREGATE_T = True
-        config.ASSIGN_POS_U = False
-        config.REG_LOSS = True
 
     elif config.FIT_OPTION == '2':
         config.DENSITY = 'Raw'
         config.REORDER_CELL = 'Hard'
         config.AGGREGATE_T = False
-        config.AGENES_R2 = 1
 
     else:
         raise ValueError('config.FIT_OPTION is invalid')
 
     config.MAX_ITER = config.MAX_ITER if config.MAX_ITER > 12000 else 12000
 
-    print ('-------> Model Configuration Settings <-------')
-    for ix, item in enumerate(vars(config).items()):
-        print ("%s: %s" % item, end=f'\t') if ix % 3 != 0 \
-            else print ('\n', "%s: %s" % item, end=f'\t')
+    print ('-------> Manully Specified Parameter <-------')
+    config_ref = Configuration()
+    dict_input, dict_ref = vars(config), vars(config_ref)
+
+    para_used = []
+    for parameter in dict_ref:
+        if dict_input[parameter] != dict_ref[parameter]:
+            print (parameter, dict_input[parameter], sep=f':\t')
+            para_used.append(parameter)
+
+    if len(para_used) == 0:
+        print ('None')
+
+    print ('-------> Model Configuration Settings <------')
+    default_para = ['N_TOP_GENES', 
+                    'LEARNING_RATE', 
+                    'FIT_OPTION', 
+                    'DENSITY', 
+                    'REORDER_CELL', 
+                    'AGGREGATE_T', 
+                    'R2_ADJUST', 
+                    'GENE_PRIOR', 
+                    'VGENES', 
+                    'IROOT']
+
+    for parameter in default_para:
+        if parameter not in para_used:
+            print (parameter, dict_ref[parameter], sep=f':\t')
     print (f'\n')
 
     scv.settings.presenter_view = True
@@ -81,13 +103,16 @@ def run_model(
         data_path = os.path.join(cwd, 'res', 'temp.h5ad')
 
     if normalize:
-        scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=config.N_TOP_GENES)
-        scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
+        scv.pp.filter_and_normalize(adata, 
+                                    min_shared_counts=config.MIN_SHARED_COUNTS, 
+                                    n_top_genes=config.N_TOP_GENES)
+        scv.pp.moments(adata, 
+                        n_pcs=config.N_PCS, 
+                        n_neighbors=config.N_NEIGHBORS)
     else:
         scv.pp.neighbors(adata)
 
     remove_dir(data_path, adata)
-    import logging
     logging.basicConfig(filename=os.path.join(adata.uns['temp'], 'logging.txt'),
                         filemode='a',
                         format='%(asctime)s, %(levelname)s, %(message)s',
@@ -100,34 +125,23 @@ def run_model(
             adata = adata_second.copy()
             adata.obs['latent_time_gm'] = pre_time_gm
 
-        if config.IROOT == 'gcount':
-            adata.obs['gcount'] = np.sum(adata.X.todense() > 0, axis=1)
-            init_time = 1 - min_max(adata.obs.groupby(label)['gcount'].mean())
-
-            for id in list(init_time.index):
-                adata.obs.loc[adata.obs[label] == id, 'gcount'] = init_time[id]
-
         adata.uns['datapath'] = data_path
         adata.uns['label'] = label
         adata.uns['base_function'] = config.BASE_FUNCTION
 
-        if 'true_alpha' not in adata.var.columns:
-            if config.BASIS is None:
-                basis_keys = ["pca", "tsne", "umap"]
-                basis = [key for key in basis_keys if f"X_{key}" in adata.obsm.keys()][-1]
-            elif f"X_{config.BASIS}" in adata.obsm.keys():
-                basis = config.BASIS
-            else:
-                raise ValueError('Invalid embedding parameter self.BASIS')
-            adata.uns['basis'] = basis
-
+        if config.BASIS is None:
+            basis_keys = ["pca", "tsne", "umap"]
+            basis = [key for key in basis_keys if f"X_{key}" in adata.obsm.keys()][-1]
+        elif f"X_{config.BASIS}" in adata.obsm.keys():
+            basis = config.BASIS
         else:
-            basis = None
+            raise ValueError('Invalid embedding parameter config.BASIS')
+        adata.uns['basis'] = basis
 
-        # if 'scNT' in data_path:
-        #     import pandas as pd
-        #     gene_ids = pd.read_csv('../data/scNT/brie_neuron_splicing_time.tsv', delimiter='\t', index_col='GeneID')
-        #     config.VGENES = list(gene_ids.loc[gene_ids['time_FDR'] < 0.01].index)
+        if 'scNT' in data_path:
+            import pandas as pd
+            gene_ids = pd.read_csv('../data/scNT/brie_neuron_splicing_time.tsv', delimiter='\t', index_col='GeneID')
+            config.VGENES = list(gene_ids.loc[gene_ids['time_FDR'] < 0.01].index)
 
         model = Velocity(adata, config=config)
         model.get_velo_genes()