diff --git a/sistr/sistr_cmd.py b/sistr/sistr_cmd.py index 7a397b9..31a626d 100644 --- a/sistr/sistr_cmd.py +++ b/sistr/sistr_cmd.py @@ -186,14 +186,14 @@ def infer_o_antigen(prediction): else: counter_o_antigens = Counter(series_o_antigens) most_common_o_antigen = counter_o_antigens.most_common(1)[0][0] - # for O24 and O25 antigens need to remove those antigens as we do not doe any testing in the lab + # for O24 and O25 antigens need to remove those antigens as we do not do any wet-lab testing if any([True if antigen in most_common_o_antigen else False for antigen in ['24','25'] ]): - logging.info(f"Cleaning O antigen from 24 and 25 antigens {most_common_o_antigen} ....") + logging.info(f"Cleaning, simplifying O antigen as 24 or 25 antigen found ...") for pattern in [r",\[24\]", r",\[25\]", r",24", r",25",r"\[1\],",r"1,"]: most_common_o_antigen = re.sub(pattern,'',most_common_o_antigen) logging.info(f"Reporting final O-antigen result {most_common_o_antigen}") prediction.o_antigen = most_common_o_antigen - prediction.antigenic_profile=f"{prediction.o_antigen}:{prediction.h1}:{prediction.h2}" + prediction.antigenic_formula=f"{prediction.o_antigen}:{prediction.h1}:{prediction.h2}" def download_to_file(url,file): with open(file, 'wb') as f: @@ -245,7 +245,7 @@ def setup_sistr_dbs(): logging.info(f"Downloading databases successful installed at {os.path.abspath(resource_filename('sistr', 'data/'))}") f = open(resource_filename('sistr', 'dbstatus.txt'),'w') f.write("DB downloaded on : {} from {}".format(datetime.today().strftime('%Y-%m-%d'),SISTR_DB_URL)) - logging.info(f"{f.name}") + logging.info(f"DB status file written at {f.name} path") f.close() @@ -257,7 +257,6 @@ def setup_sistr_dbs(): def sistr_predict(input_fasta, genome_name, tmp_dir, keep_tmp, args): blast_runner = None serovars_selected_list = [] - print(args) if args.list_of_serovars: if os.path.exists(args.list_of_serovars): with open(args.list_of_serovars) as fp: @@ -295,26 +294,21 @@ def 
sistr_predict(input_fasta, genome_name, tmp_dir, keep_tmp, args): prediction = serovar_predictor.get_serovar_prediction() prediction.genome = genome_name prediction.fasta_filepath = os.path.abspath(input_fasta) - print(f"sistr_cmd.py L271: Antigen serovar prediction: {prediction.serovar} and {serovar_predictor.serovar}") + if cgmlst_prediction: merge_cgmlst_prediction(prediction, cgmlst_prediction) - print(f"sistr_cmd.py L275: add cgmlst_prediction_serovar: {prediction.serovar}"); if mash_prediction: merge_mash_prediction(prediction, mash_prediction) - print(f"sistr_cmd.py L278: add mash_prediction_serovar: {prediction.serovar}"); overall_serovar_call(prediction, serovar_predictor) - print(f"sistr_cmd.py L280: overall_serovar: {prediction.serovar}"); #raise Exception; infer_o_antigen(prediction) # if list of reportable serovars is provided to check prediction serovar against if serovars_selected_list: prediction.serovar_in_list = "N" for selected_serovar in serovars_selected_list: if selected_serovar == prediction.serovar: - print(selected_serovar, prediction.serovar) prediction.serovar_in_list = "Y" break - print(f"L288: sistr_cmd.py antigenic_formula: {prediction.antigenic_profile}") logging.info('%s | Antigen gene BLAST serovar prediction: "%s" serogroup=%s %s:%s:%s', genome_name, prediction.serovar_antigen, diff --git a/sistr/src/serovar_prediction/__init__.py b/sistr/src/serovar_prediction/__init__.py index cc9e2f6..fe30e03 100644 --- a/sistr/src/serovar_prediction/__init__.py +++ b/sistr/src/serovar_prediction/__init__.py @@ -363,7 +363,6 @@ def predict_antigens(self): @staticmethod def get_serovar(df, sg, h1, h2, spp): - #print(df.query(f'H1=="z"')) h2_is_missing = '-' in h2 b_sg = df['Serogroup'].isin(sg) b_h1 = df['H1'].isin(h1) @@ -445,9 +444,7 @@ def predict_serovar_from_antigen_blast(self): if not isinstance(h2, list): h2 = [h2] - print(f"L444 sistr/src/serovar_prediction/__init__.py predict_serovar_from_antigen_blast() self.serovar = {self.serovar} 
self.subspecies = {self.subspecies}") self.serovar = SerovarPredictor.get_serovar(df, sg, h1, h2, self.subspecies) - print(f"L446 sistr/src/serovar_prediction/__init__.py predict_serovar_from_antigen_blast() self.serovar = {self.serovar}") if self.serovar is None: try: spp_roman = spp_name_to_roman[self.subspecies] @@ -518,9 +515,6 @@ def overall_serovar_call(serovar_prediction, antigen_predictor): """ assert isinstance(serovar_prediction, SerovarPrediction) assert isinstance(antigen_predictor, SerovarPredictor) - print(f"serovar_prediction: {serovar_prediction.__dict__}") - print(f"antigen_predictor: {antigen_predictor.__dict__}") - print(f"sistr/src/serovar_prediction/__init__.py L518: {serovar_prediction.serovar}") h1 = antigen_predictor.h1 h2 = antigen_predictor.h2 @@ -533,7 +527,6 @@ def overall_serovar_call(serovar_prediction, antigen_predictor): serovar_prediction.serovar_antigen = antigen_predictor.serovar #assign antigen serovar from antigen_predictor object cgmlst_serovar = serovar_prediction.serovar_cgmlst cgmlst_distance = float(serovar_prediction.cgmlst_distance) - print(f"sistr/src/serovar_prediction/__init__.py L530: serovar_prediction.serovar_antigen ={serovar_prediction.serovar_antigen} cgmlst_serovar = {serovar_prediction.serovar_cgmlst} mash_serovar = {serovar_prediction.mash_serovar}") h1_h2_share_group = False for h2_groups in H2_FLJB_SIMILARITY_GROUPS: @@ -583,11 +576,8 @@ def overall_serovar_call(serovar_prediction, antigen_predictor): serovar_prediction.h1_flic_prediction.h1 = h1 - print(f"sistr/src/serovar_prediction/__init__.py L582: antigen_predictor.serovar = {antigen_predictor.serovar}") antigen_predictor.predict_serovar_from_antigen_blast() - print(f"sistr/src/serovar_prediction/__init__.py L584: antigen_predictor.serovar = {antigen_predictor.serovar}") serovar_prediction.serovar_antigen = antigen_predictor.serovar - print(f"sistr/src/serovar_prediction/__init__.py L584: serovar_prediction.serovar_antigen 
={serovar_prediction.serovar_antigen}") null_result = '-:-:-' @@ -607,7 +597,9 @@ def overall_serovar_call(serovar_prediction, antigen_predictor): serovar_prediction.serovar = '{} {}:{}:{}'.format(spp_roman, sg, h1, h2) else: serovar_prediction.serovar = '{}:{}:{}'.format(spp_roman, sg, h1, h2) + logging.info(f"Overall serovar assigned by antigen alleles serovar {serovar_prediction.serovar}") elif cgmlst_serovar is not None and cgmlst_distance <= CGMLST_DISTANCE_THRESHOLD: + logging.info(f"Overall serovar assigned by cgMLST serovar {cgmlst_serovar} ...") serovar_prediction.serovar = cgmlst_serovar else: serovar_prediction.serovar = null_result @@ -615,6 +607,7 @@ def overall_serovar_call(serovar_prediction, antigen_predictor): spd = serovar_prediction.__dict__ mash_dist = float(spd['mash_distance']) if mash_dist <= MASH_DISTANCE_THRESHOLD: + logging.info(f"Overall serovar assigned by MASH serovar: {spd['mash_distance']} ...") serovar_prediction.serovar = spd['mash_serovar'] else: serovars_from_antigen = antigen_predictor.serovar.split('|') @@ -622,7 +615,7 @@ def overall_serovar_call(serovar_prediction, antigen_predictor): serovars_from_antigen = [serovars_from_antigen] if cgmlst_serovar is not None: if cgmlst_serovar in serovars_from_antigen: - logging.info(f"Antigen predictor has multiple serovar results {antigen_predictor.serovar}, but assigned final cgmlst serovar {cgmlst_serovar} ...") + logging.info(f"Overall serovar assigned by cgMLST serovar {cgmlst_serovar} ...") serovar_prediction.serovar = cgmlst_serovar elif 'mash_match' in serovar_prediction.__dict__: @@ -631,16 +624,16 @@ def overall_serovar_call(serovar_prediction, antigen_predictor): mash_dist = float(spd['mash_distance']) if mash_serovar in serovars_from_antigen: serovar_prediction.serovar = mash_serovar - logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar}, but assigned final mash serovar {mash_serovar} ...") + logging.info(f"Overall serovar assigned by MASH serovar 
{mash_serovar} ...") else: if mash_dist <= MASH_DISTANCE_THRESHOLD: serovar_prediction.serovar = mash_serovar - logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar}, but assigned final mash serovar {mash_serovar} ...") + logging.info(f"Overall serovar assigned by MASH serovar {mash_serovar} ...") else: - logging.info(f"MASH serovar prediction was NOT assigned as mash distance {mash_dist} > {MASH_DISTANCE_THRESHOLD} ") + logging.info(f"Overall serovar NOT assigned by MASH serovar as closest ref. genome MASH distance {mash_dist} > {MASH_DISTANCE_THRESHOLD} ") if serovar_prediction.serovar is None: - logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar} and it will be assigned as a final serovar ...") + logging.info(f"Overall serovar assigned by antigen alleles serovar: {antigen_predictor.serovar}") serovar_prediction.serovar = serovar_prediction.serovar_antigen if serovar_prediction.h1 is None: