Commit

first release version
fdtomasi committed Feb 28, 2017
1 parent cbf0d22 commit de19f99
Showing 8 changed files with 3,789 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,6 +3,8 @@ __pycache__/
*.py[cod]
*$py.class

+.ftpconfig
+
# C extensions
*.so

52 changes: 52 additions & 0 deletions examples/config_example.py
@@ -0,0 +1,52 @@
"""Example of an ICING configuration file.
Author: Federico Tomasi
Copyright (c) 2017, Federico Tomasi.
Licensed under the FreeBSD license (see LICENSE.txt).
"""
import os
current_folder = os.path.dirname(os.path.abspath(__file__))

exp_tag = 'example'
output_root_folder = 'icing_example_result'

# one or more input files (keep this a list; paths are joined below)
db_file = [
'data/clones_95.tab',
# 'data/clones_100.1.tab',
# 'data/clones_100.2.tab'
]

db_file = [os.path.join(current_folder, x) for x in db_file]
exp_tag = [x.split('/')[-1] for x in db_file]

# input csv dialect: "excel" (comma-separated) or "excel-tab" (tab-separated)
dialect = "excel-tab"

# Fraction (0 to 1) of IG sequences on which to calculate the correction function
learning_function_quantity = 1

# Parameters of the IG similarity function
sim_func_args = {
'vj_weight': 0,
'sk_weight': 1,
# uncomment the following to skip the correction function
# 'correction_function': lambda x: 1,

# string kernel parameters.
'model': 'sk',
'ssk_params': {
'min_kn': 3, 'max_kn': 9,
'lamda': .25, 'check_min_length': 1
},
# uncomment the following to use Hamming similarity instead
# 'model': 'ham',

# clustering method: 'ap' (affinity propagation) or 'hc' (hierarchical)
'clustering': 'ap',

# tolerance on HCDR3 length
'tol': 6
}
# The following threshold is ignored with AP clustering
# threshold = .025
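
For context, a configuration file like this one is read by scripts/ici_run.py (modified later in this commit). A sketch of the invocation, assuming the script takes the config path as its only argument:

    python scripts/ici_run.py examples/config_example.py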
2,668 changes: 2,668 additions & 0 deletions examples/data/clones_459.tab

Large diffs are not rendered by default.

500 changes: 500 additions & 0 deletions examples/data/clones_91.tab

Large diffs are not rendered by default.

544 changes: 544 additions & 0 deletions examples/data/clones_95.tab

Large diffs are not rendered by default.

14 changes: 9 additions & 5 deletions icing/core/cloning.py
@@ -92,8 +92,8 @@ def sim_function(
ig1.junc, ig2.junc, ig1.junction_length, ig2.junction_length,
dist_mat, dist_mat_max=dist_mat_max, tol=tol)
similarity += sk_weight * (1 - dist)
-    else:
-        raise ValueError("model '%s' not understood" % model)
+    # else:
+    #     raise ValueError("model '%s' not understood" % model)

if similarity > 0 and correct:
correction = correction_function(np.mean((ig1.mut, ig2.mut)))
@@ -482,7 +482,7 @@ def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
dists = squareform(1 - sm.toarray())
links = fastcluster.linkage(dists, method='ward')
try:
-        clusters_ = fcluster(links, 1 - threshold, 'distance')
+        clusters_ = fcluster(links, threshold, 'distance')
except ValueError as err:
logging.critical(err)
clusters_ = np.zeros(1, dtype=int)
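
Note the change in cut semantics: scipy's fcluster with criterion='distance' flattens the dendrogram at a maximum cophenetic distance, and the distances here are 1 - similarity, so the threshold is now interpreted directly as a distance rather than as a similarity. A minimal sketch of the two behaviours on a toy similarity matrix (illustration only, not code from this commit; scipy's linkage stands in for fastcluster's):

    import numpy as np
    from scipy.cluster.hierarchy import fcluster, linkage
    from scipy.spatial.distance import squareform

    threshold = 0.05
    similarity = np.array([[1.0, 0.9, 0.1],
                           [0.9, 1.0, 0.2],
                           [0.1, 0.2, 1.0]])
    dists = squareform(1 - similarity)   # condensed distance matrix
    links = linkage(dists, method='ward')
    # new behaviour: cut at distance `threshold` (tight clusters)
    print(fcluster(links, threshold, 'distance'))
    # old behaviour: cut at distance `1 - threshold` (loose clusters)
    print(fcluster(links, 1 - threshold, 'distance'))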
@@ -552,8 +552,12 @@ def define_clones(db_iter, exp_tag='debug', root=None, method='ap',
clusters = define_clusts(similarity_matrix, threshold=threshold,
method=method)
n_clones = np.max(clusters) - np.min(clusters) + 1
logging.critical("Number of clones: %i, threshold %.3f", n_clones,
threshold)
if method.lower() == 'ap':
# log only number of clones
logging.critical("Number of clones: %i", n_clones)
else:
logging.critical("Number of clones: %i, threshold %.3f", n_clones,
threshold)
with open(os.path.join(output_folder, 'summary.txt'), 'w') as f:
f.write("filename: %s\n" % db_file)
f.write("clones: %i\n" % n_clones)
8 changes: 4 additions & 4 deletions icing/core/learning_function.py
@@ -195,7 +195,7 @@ def intra_donor_distance(db='', lim_mut1=(0, 0), lim_mut2=(0, 0), type_ig='Mem',
igs2, juncs2 = shuffle_ig(igs2, juncs2, max_seqs)
mut = np.mean(list(chain((x.mut for x in igs1),
(x.mut for x in igs2))))
-    print(len(juncs1))
+    # logging.info("Computing similarity ")
return make_hist(
juncs1, juncs2, filename, lim_mut1, lim_mut2, type_ig, mut,
donor, None, bins, min_seqs, ig1=igs1, ig2=igs2,
@@ -253,7 +253,7 @@ def distr_muts(db, quantity=0.15, bins=50, max_seqs=4000, min_seqs=100,
try:
max_mut, n_tot = io.get_max_mut(db)
# if max_mut < 1:
-        lin = np.linspace(0, max_mut, min(n_tot / 10., 12))
+        lin = np.linspace(0, max_mut, min(n_tot / 15., 12))
# lin = np.linspace(0, max_mut, 10.)
sets = [(0, 0)] + zip(lin[:-1], lin[1:])
# sets = [(0, 0)] + [(i - 1, i) for i in range(1, int(max_mut) + 1)]
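
This coarsens the mutation-level binning: for example, with n_tot = 120 records the grid goes from min(120/10., 12) = 12 points down to min(120/15., 12) = 8, so each mutation bin pools more sequences.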
@@ -419,7 +419,7 @@ def learning_function(my_dict, order=3, aplot='alphaplot.pdf'):
plt.close()

# poly = partial(model, res.x)
-    return poly, (filter(
+    return poly, 1 - (filter(
lambda x: x > 0,
np.array(thresholds)[np.array(samples).argsort()[::-1]]) or [0])[0]
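
Unpacked, the new return value reads as follows; pick_threshold is a hypothetical helper equivalent under my reading of the expression (Python 2 semantics, where filter() returns a list, are assumed in the original):

    import numpy as np

    def pick_threshold(thresholds, samples):
        """Hypothetical helper equivalent to the new return expression."""
        # order candidate thresholds by descending number of samples
        order = np.array(samples).argsort()[::-1]
        positive = [t for t in np.array(thresholds)[order] if t > 0]
        best = positive[0] if positive else 0
        # returned as 1 - best, i.e. the similarity threshold expressed
        # as a distance (cf. the fcluster change in cloning.py above)
        return 1 - best

    print(pick_threshold([0.3, 0.7, 0.0], [10, 50, 5]))  # 0.3, since 0.7 wins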

@@ -440,7 +440,7 @@ def generate_correction_function(db, quantity, sim_func_args=None, order=3,
# case 2: file not exists
else:
my_dict = distr_muts(
-            db, quantity=quantity, min_seqs=4, max_seqs=1000,
+            db, quantity=quantity, min_seqs=10, max_seqs=1000,
sim_func_args=sim_func_args)
popt, threshold_naive = learning_function(my_dict, order, aplot)
# save for later, in case of analysis on the same db
15 changes: 10 additions & 5 deletions scripts/ici_run.py
@@ -60,8 +60,8 @@ def main(config_file):
'subsets': (), 'mutation': (0, 0), 'apply_filter': None,
'max_records': None, 'dialect': 'excel-tab', 'exp_tag': 'debug',
'output_root_folder': 'results', 'force_silhouette': False,
-        'sim_func_args': {}, 'threshold': 0.0536, 'verbose': False,
-        'learning_function_quantity': 0.15,
+        'sim_func_args': {}, 'threshold': None, 'verbose': False,
+        'learning_function_quantity': 0.3,
'learning_function_order': 3})

# Define logging file
@@ -84,26 +84,31 @@ def main(config_file):
logging.info("Database loaded (%i records)", len(db_iter))

local_sim_func_args = config.sim_func_args.copy()
-    alpha_plot = None
+    alpha_plot, threshold = None, None
if local_sim_func_args.get("correction_function", None) is None:
record_quantity = np.clip(config.learning_function_quantity, 0, 1)
logging.info("Generate correction function with %.2f%% of records",
record_quantity * 100)
func_args_copy = local_sim_func_args.copy()
func_args_copy.pop('clustering', 'ap')
-    (local_sim_func_args['correction_function'], config.threshold,
+    (local_sim_func_args['correction_function'], threshold,
alpha_plot) = generate_correction_function(
db_file, quantity=record_quantity,
sim_func_args=func_args_copy,
order=config.learning_function_order, root=root)

+    if config.threshold is None and threshold is None:
+        # no correction function and no threshold specified in config
+        threshold = .05
+    elif config.threshold is not None:
+        threshold = config.threshold
logging.info("Start define_clones function ...")
clustering = local_sim_func_args.pop('clustering', 'ap')
outfolder, clone_dict = define_clones(
db_iter, exp_tag=filename, root=root,
method=clustering,
sim_func_args=local_sim_func_args,
-        threshold=config.threshold, db_file=db_file)
+        threshold=threshold, db_file=db_file)

try:
# Copy the config just used in the output folder
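
The threshold resolution added above can be summarized with a hypothetical helper (resolve_threshold is not in the commit; this is just a restatement of the branch logic): an explicit config value wins, otherwise the value learned alongside the correction function is used, otherwise the default of .05.

    def resolve_threshold(config_threshold, learned_threshold, default=.05):
        """Hypothetical restatement of the branching added in main()."""
        if config_threshold is not None:
            return config_threshold    # explicit value from the config file
        if learned_threshold is not None:
            return learned_threshold   # learned with the correction function
        return default                 # neither given: fall back to .05

    assert resolve_threshold(None, None) == .05
    assert resolve_threshold(.02, .1) == .02
    assert resolve_threshold(None, .1) == .1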
