Commit

first release version
fdtomasi committed Feb 28, 2017
1 parent cbf0d22 commit de19f99
Showing 8 changed files with 3,789 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -3,6 +3,8 @@ __pycache__/
*.py[cod]
*$py.class

+.ftpconfig
+
# C extensions
*.so

52 changes: 52 additions & 0 deletions examples/config_example.py
@@ -0,0 +1,52 @@
"""Example of an ICING configuration file.
Author: Federico Tomasi
Copyright (c) 2017, Federico Tomasi.
Licensed under the FreeBSD license (see LICENSE.txt).
"""
import os
current_folder = os.path.dirname(os.path.abspath(__file__))

exp_tag = 'example'
output_root_folder = 'icing_example_result'

# one or more input files (keep this a list; paths are joined below)
db_file = [
'data/clones_95.tab',
# 'data/clones_100.1.tab',
# 'data/clones_100.2.tab'
]

db_file = [os.path.join(current_folder, x) for x in db_file]
exp_tag = [x.split('/')[-1] for x in db_file]

# input csv dialect: "excel" (comma-separated) or "excel-tab" (tab-separated)
dialect = "excel-tab"

# Fraction (0 to 1) of IG sequences on which to calculate the correction function
learning_function_quantity = 1

# Parameters of the IG similarity function
sim_func_args = {
'vj_weight': 0,
'sk_weight': 1,
# uncomment the following to skip the correction function
# 'correction_function': lambda x: 1,

# string kernel parameters.
'model': 'sk',
'ssk_params': {
'min_kn': 3, 'max_kn': 9,
'lamda': .25, 'check_min_length': 1
},
# uncomment the following to use Hamming similarity instead
# 'model': 'ham',

# clustering method: 'ap' (affinity propagation) or 'hc' (hierarchical)
'clustering': 'ap',

# tolerance on HCDR3 length
'tol': 6
}
# The following threshold is ignored with AP clustering
# threshold = .025
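
For context, a configuration file like this one is read by scripts/ici_run.py (modified later in this commit). A sketch of the invocation, assuming the script takes the config path as its only argument:

    python scripts/ici_run.py examples/config_example.py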
2,668 changes: 2,668 additions & 0 deletions examples/data/clones_459.tab

Large diffs are not rendered by default.

500 changes: 500 additions & 0 deletions examples/data/clones_91.tab

Large diffs are not rendered by default.

544 changes: 544 additions & 0 deletions examples/data/clones_95.tab

Large diffs are not rendered by default.

14 changes: 9 additions & 5 deletions icing/core/cloning.py
@@ -92,8 +92,8 @@ def sim_function(
ig1.junc, ig2.junc, ig1.junction_length, ig2.junction_length,
dist_mat, dist_mat_max=dist_mat_max, tol=tol)
similarity += sk_weight * (1 - dist)
-    else:
-        raise ValueError("model '%s' not understood" % model)
+    # else:
+    #     raise ValueError("model '%s' not understood" % model)

if similarity > 0 and correct:
correction = correction_function(np.mean((ig1.mut, ig2.mut)))
@@ -482,7 +482,7 @@ def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
dists = squareform(1 - sm.toarray())
links = fastcluster.linkage(dists, method='ward')
try:
-        clusters_ = fcluster(links, 1 - threshold, 'distance')
+        clusters_ = fcluster(links, threshold, 'distance')
except ValueError as err:
logging.critical(err)
clusters_ = np.zeros(1, dtype=int)
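
Note the change in cut semantics: scipy's fcluster with criterion='distance' flattens the dendrogram at a maximum cophenetic distance, and the distances here are 1 - similarity, so the threshold is now interpreted directly as a distance rather than as a similarity. A minimal sketch of the two behaviours on a toy similarity matrix (illustration only, not code from this commit; scipy's linkage stands in for fastcluster's):

    import numpy as np
    from scipy.cluster.hierarchy import fcluster, linkage
    from scipy.spatial.distance import squareform

    threshold = 0.05
    similarity = np.array([[1.0, 0.9, 0.1],
                           [0.9, 1.0, 0.2],
                           [0.1, 0.2, 1.0]])
    dists = squareform(1 - similarity)   # condensed distance matrix
    links = linkage(dists, method='ward')
    # new behaviour: cut at distance `threshold` (tight clusters)
    print(fcluster(links, threshold, 'distance'))
    # old behaviour: cut at distance `1 - threshold` (loose clusters)
    print(fcluster(links, 1 - threshold, 'distance'))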
@@ -552,8 +552,12 @@ def define_clones(db_iter, exp_tag='debug', root=None, method='ap',
clusters = define_clusts(similarity_matrix, threshold=threshold,
method=method)
n_clones = np.max(clusters) - np.min(clusters) + 1
logging.critical("Number of clones: %i, threshold %.3f", n_clones,
threshold)
if method.lower() == 'ap':
# log only number of clones
logging.critical("Number of clones: %i", n_clones)
else:
logging.critical("Number of clones: %i, threshold %.3f", n_clones,
threshold)
with open(os.path.join(output_folder, 'summary.txt'), 'w') as f:
f.write("filename: %s\n" % db_file)
f.write("clones: %i\n" % n_clones)
8 changes: 4 additions & 4 deletions icing/core/learning_function.py
@@ -195,7 +195,7 @@ def intra_donor_distance(db='', lim_mut1=(0, 0), lim_mut2=(0, 0), type_ig='Mem',
igs2, juncs2 = shuffle_ig(igs2, juncs2, max_seqs)
mut = np.mean(list(chain((x.mut for x in igs1),
(x.mut for x in igs2))))
-    print(len(juncs1))
+    # logging.info("Computing similarity ")
return make_hist(
juncs1, juncs2, filename, lim_mut1, lim_mut2, type_ig, mut,
donor, None, bins, min_seqs, ig1=igs1, ig2=igs2,
@@ -253,7 +253,7 @@ def distr_muts(db, quantity=0.15, bins=50, max_seqs=4000, min_seqs=100,
try:
max_mut, n_tot = io.get_max_mut(db)
# if max_mut < 1:
-        lin = np.linspace(0, max_mut, min(n_tot / 10., 12))
+        lin = np.linspace(0, max_mut, min(n_tot / 15., 12))
# lin = np.linspace(0, max_mut, 10.)
sets = [(0, 0)] + zip(lin[:-1], lin[1:])
# sets = [(0, 0)] + [(i - 1, i) for i in range(1, int(max_mut) + 1)]
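
This coarsens the mutation-level binning: for example, with n_tot = 120 records the grid goes from min(120/10., 12) = 12 points down to min(120/15., 12) = 8, so each mutation bin pools more sequences.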
@@ -419,7 +419,7 @@ def learning_function(my_dict, order=3, aplot='alphaplot.pdf'):
plt.close()

# poly = partial(model, res.x)
-    return poly, (filter(
+    return poly, 1 - (filter(
lambda x: x > 0,
np.array(thresholds)[np.array(samples).argsort()[::-1]]) or [0])[0]
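
Unpacked, the new return value reads as follows; pick_threshold is a hypothetical helper equivalent under my reading of the expression (Python 2 semantics, where filter() returns a list, are assumed in the original):

    import numpy as np

    def pick_threshold(thresholds, samples):
        """Hypothetical helper equivalent to the new return expression."""
        # order candidate thresholds by descending number of samples
        order = np.array(samples).argsort()[::-1]
        positive = [t for t in np.array(thresholds)[order] if t > 0]
        best = positive[0] if positive else 0
        # returned as 1 - best, i.e. the similarity threshold expressed
        # as a distance (cf. the fcluster change in cloning.py above)
        return 1 - best

    print(pick_threshold([0.3, 0.7, 0.0], [10, 50, 5]))  # 0.3, since 0.7 wins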

@@ -440,7 +440,7 @@ def generate_correction_function(db, quantity, sim_func_args=None, order=3,
# case 2: file not exists
else:
my_dict = distr_muts(
-            db, quantity=quantity, min_seqs=4, max_seqs=1000,
+            db, quantity=quantity, min_seqs=10, max_seqs=1000,
sim_func_args=sim_func_args)
popt, threshold_naive = learning_function(my_dict, order, aplot)
# save for later, in case of analysis on the same db
15 changes: 10 additions & 5 deletions scripts/ici_run.py
@@ -60,8 +60,8 @@ def main(config_file):
'subsets': (), 'mutation': (0, 0), 'apply_filter': None,
'max_records': None, 'dialect': 'excel-tab', 'exp_tag': 'debug',
'output_root_folder': 'results', 'force_silhouette': False,
-        'sim_func_args': {}, 'threshold': 0.0536, 'verbose': False,
-        'learning_function_quantity': 0.15,
+        'sim_func_args': {}, 'threshold': None, 'verbose': False,
+        'learning_function_quantity': 0.3,
'learning_function_order': 3})

# Define logging file
@@ -84,26 +84,31 @@ def main(config_file):
logging.info("Database loaded (%i records)", len(db_iter))

local_sim_func_args = config.sim_func_args.copy()
-    alpha_plot = None
+    alpha_plot, threshold = None, None
if local_sim_func_args.get("correction_function", None) is None:
record_quantity = np.clip(config.learning_function_quantity, 0, 1)
logging.info("Generate correction function with %.2f%% of records",
record_quantity * 100)
func_args_copy = local_sim_func_args.copy()
func_args_copy.pop('clustering', 'ap')
-    (local_sim_func_args['correction_function'], config.threshold,
+    (local_sim_func_args['correction_function'], threshold,
alpha_plot) = generate_correction_function(
db_file, quantity=record_quantity,
sim_func_args=func_args_copy,
order=config.learning_function_order, root=root)

+    if config.threshold is None and threshold is None:
+        # no correction function and no threshold specified in config
+        threshold = .05
+    elif config.threshold is not None:
+        threshold = config.threshold
logging.info("Start define_clones function ...")
clustering = local_sim_func_args.pop('clustering', 'ap')
outfolder, clone_dict = define_clones(
db_iter, exp_tag=filename, root=root,
method=clustering,
sim_func_args=local_sim_func_args,
-        threshold=config.threshold, db_file=db_file)
+        threshold=threshold, db_file=db_file)

try:
# Copy the config just used in the output folder
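
The threshold resolution added above can be summarized with a hypothetical helper (resolve_threshold is not in the commit; this is just a restatement of the branch logic): an explicit config value wins, otherwise the value learned alongside the correction function is used, otherwise the default of .05.

    def resolve_threshold(config_threshold, learned_threshold, default=.05):
        """Hypothetical restatement of the branching added in main()."""
        if config_threshold is not None:
            return config_threshold    # explicit value from the config file
        if learned_threshold is not None:
            return learned_threshold   # learned with the correction function
        return default                 # neither given: fall back to .05

    assert resolve_threshold(None, None) == .05
    assert resolve_threshold(.02, .1) == .02
    assert resolve_threshold(None, .1) == .1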
