Skip to content

Commit

Permalink
pre-release v1.1.3 commit tested against test dataset with O24 and O2…
Browse files Browse the repository at this point in the history
…5 antigens from PulseNet and EnteroBase
  • Loading branch information
kbessonov1984 committed Sep 4, 2024
1 parent e43911a commit 0d31550
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 26 deletions.
16 changes: 5 additions & 11 deletions sistr/sistr_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,14 @@ def infer_o_antigen(prediction):
else:
counter_o_antigens = Counter(series_o_antigens)
most_common_o_antigen = counter_o_antigens.most_common(1)[0][0]
# for O24 and O25 antigens need to remove those antigens as we do not doe any testing in the lab
# for O24 and O25 antigens need to remove those antigens as we do not do any wet-lab testing
if any([True if antigen in most_common_o_antigen else False for antigen in ['24','25'] ]):
logging.info(f"Cleaning O antigen from 24 and 25 antigens {most_common_o_antigen} ....")
logging.info(f"Cleaning, simplifying O antigen as 24 or 25 antigen found ...")
for pattern in [r",\[24\]", r",\[25\]", r",24", r",25",r"\[1\],",r"1,"]:
most_common_o_antigen = re.sub(pattern,'',most_common_o_antigen)
logging.info(f"Reporting final O-antigen result {most_common_o_antigen}")
prediction.o_antigen = most_common_o_antigen
prediction.antigenic_profile=f"{prediction.o_antigen}:{prediction.h1}:{prediction.h2}"
prediction.antigenic_formula=f"{prediction.o_antigen}:{prediction.h1}:{prediction.h2}"

def download_to_file(url,file):
with open(file, 'wb') as f:
Expand Down Expand Up @@ -245,7 +245,7 @@ def setup_sistr_dbs():
logging.info(f"Downloading databases successful installed at {os.path.abspath(resource_filename('sistr', 'data/'))}")
f = open(resource_filename('sistr', 'dbstatus.txt'),'w')
f.write("DB downloaded on : {} from {}".format(datetime.today().strftime('%Y-%m-%d'),SISTR_DB_URL))
logging.info(f"{f.name}")
logging.info(f"DB status file written at {f.name} path")
f.close()


Expand All @@ -257,7 +257,6 @@ def setup_sistr_dbs():
def sistr_predict(input_fasta, genome_name, tmp_dir, keep_tmp, args):
blast_runner = None
serovars_selected_list = []
print(args)
if args.list_of_serovars:
if os.path.exists(args.list_of_serovars):
with open(args.list_of_serovars) as fp:
Expand Down Expand Up @@ -295,26 +294,21 @@ def sistr_predict(input_fasta, genome_name, tmp_dir, keep_tmp, args):
prediction = serovar_predictor.get_serovar_prediction()
prediction.genome = genome_name
prediction.fasta_filepath = os.path.abspath(input_fasta)
print(f"sistr_cmd.py L271: Antigen serovar prediction: {prediction.serovar} and {serovar_predictor.serovar}")

if cgmlst_prediction:
merge_cgmlst_prediction(prediction, cgmlst_prediction)
print(f"sistr_cmd.py L275: add cgmlst_prediction_serovar: {prediction.serovar}");
if mash_prediction:
merge_mash_prediction(prediction, mash_prediction)
print(f"sistr_cmd.py L278: add mash_prediction_serovar: {prediction.serovar}");
overall_serovar_call(prediction, serovar_predictor)
print(f"sistr_cmd.py L280: overall_serovar: {prediction.serovar}"); #raise Exception;
infer_o_antigen(prediction)
# if a list of reportable serovars is provided, check the predicted serovar against it
if serovars_selected_list:
prediction.serovar_in_list = "N"
for selected_serovar in serovars_selected_list:
if selected_serovar == prediction.serovar:
print(selected_serovar, prediction.serovar)
prediction.serovar_in_list = "Y"
break

print(f"L288: sistr_cmd.py antigenic_formula: {prediction.antigenic_profile}")
logging.info('%s | Antigen gene BLAST serovar prediction: "%s" serogroup=%s %s:%s:%s',
genome_name,
prediction.serovar_antigen,
Expand Down
23 changes: 8 additions & 15 deletions sistr/src/serovar_prediction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,6 @@ def predict_antigens(self):

@staticmethod
def get_serovar(df, sg, h1, h2, spp):
#print(df.query(f'H1=="z"'))
h2_is_missing = '-' in h2
b_sg = df['Serogroup'].isin(sg)
b_h1 = df['H1'].isin(h1)
Expand Down Expand Up @@ -445,9 +444,7 @@ def predict_serovar_from_antigen_blast(self):

if not isinstance(h2, list):
h2 = [h2]
print(f"L444 sistr/src/serovar_prediction/__init__.py predict_serovar_from_antigen_blast() self.serovar = {self.serovar} self.subspecies = {self.subspecies}")
self.serovar = SerovarPredictor.get_serovar(df, sg, h1, h2, self.subspecies)
print(f"L446 sistr/src/serovar_prediction/__init__.py predict_serovar_from_antigen_blast() self.serovar = {self.serovar}")
if self.serovar is None:
try:
spp_roman = spp_name_to_roman[self.subspecies]
Expand Down Expand Up @@ -518,9 +515,6 @@ def overall_serovar_call(serovar_prediction, antigen_predictor):
"""
assert isinstance(serovar_prediction, SerovarPrediction)
assert isinstance(antigen_predictor, SerovarPredictor)
print(f"serovar_prediction: {serovar_prediction.__dict__}")
print(f"antigen_predictor: {antigen_predictor.__dict__}")
print(f"sistr/src/serovar_prediction/__init__.py L518: {serovar_prediction.serovar}")

h1 = antigen_predictor.h1
h2 = antigen_predictor.h2
Expand All @@ -533,7 +527,6 @@ def overall_serovar_call(serovar_prediction, antigen_predictor):
serovar_prediction.serovar_antigen = antigen_predictor.serovar #assign antigen serovar from antigen_predictor object
cgmlst_serovar = serovar_prediction.serovar_cgmlst
cgmlst_distance = float(serovar_prediction.cgmlst_distance)
print(f"sistr/src/serovar_prediction/__init__.py L530: serovar_prediction.serovar_antigen ={serovar_prediction.serovar_antigen} cgmlst_serovar = {serovar_prediction.serovar_cgmlst} mash_serovar = {serovar_prediction.mash_serovar}")

h1_h2_share_group = False
for h2_groups in H2_FLJB_SIMILARITY_GROUPS:
Expand Down Expand Up @@ -583,11 +576,8 @@ def overall_serovar_call(serovar_prediction, antigen_predictor):
serovar_prediction.h1_flic_prediction.h1 = h1


print(f"sistr/src/serovar_prediction/__init__.py L582: antigen_predictor.serovar = {antigen_predictor.serovar}")
antigen_predictor.predict_serovar_from_antigen_blast()
print(f"sistr/src/serovar_prediction/__init__.py L584: antigen_predictor.serovar = {antigen_predictor.serovar}")
serovar_prediction.serovar_antigen = antigen_predictor.serovar
print(f"sistr/src/serovar_prediction/__init__.py L584: serovar_prediction.serovar_antigen ={serovar_prediction.serovar_antigen}")


null_result = '-:-:-'
Expand All @@ -607,22 +597,25 @@ def overall_serovar_call(serovar_prediction, antigen_predictor):
serovar_prediction.serovar = '{} {}:{}:{}'.format(spp_roman, sg, h1, h2)
else:
serovar_prediction.serovar = '{}:{}:{}'.format(spp_roman, sg, h1, h2)
logging.info(f"Overall serovar assigned by antigen alleles serovar {serovar_prediction.serovar}")
elif cgmlst_serovar is not None and cgmlst_distance <= CGMLST_DISTANCE_THRESHOLD:
logging.info(f"Overall serovar assigned by cgMLST serovar {cgmlst_serovar} ...")
serovar_prediction.serovar = cgmlst_serovar
else:
serovar_prediction.serovar = null_result
if 'mash_match' in serovar_prediction.__dict__:
spd = serovar_prediction.__dict__
mash_dist = float(spd['mash_distance'])
if mash_dist <= MASH_DISTANCE_THRESHOLD:
logging.info(f"Overall serovar assigned by MASH serovar: {spd['mash_distance']} ...")
serovar_prediction.serovar = spd['mash_serovar']
else:
serovars_from_antigen = antigen_predictor.serovar.split('|')
if not isinstance(serovars_from_antigen, list):
serovars_from_antigen = [serovars_from_antigen]
if cgmlst_serovar is not None:
if cgmlst_serovar in serovars_from_antigen:
logging.info(f"Antigen predictor has multiple serovar results {antigen_predictor.serovar}, but assigned final cgmlst serovar {cgmlst_serovar} ...")
logging.info(f"Overall serovar assigned by cgMLST serovar {cgmlst_serovar} ...")
serovar_prediction.serovar = cgmlst_serovar

elif 'mash_match' in serovar_prediction.__dict__:
Expand All @@ -631,16 +624,16 @@ def overall_serovar_call(serovar_prediction, antigen_predictor):
mash_dist = float(spd['mash_distance'])
if mash_serovar in serovars_from_antigen:
serovar_prediction.serovar = mash_serovar
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar}, but assigned final mash serovar {mash_serovar} ...")
logging.info(f"Overall serovar assigned by MASH serovar {mash_serovar} ...")
else:
if mash_dist <= MASH_DISTANCE_THRESHOLD:
serovar_prediction.serovar = mash_serovar
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar}, but assigned final mash serovar {mash_serovar} ...")
logging.info(f"Overall serovar assigned by MASH serovar {mash_serovar} ...")
else:
logging.info(f"MASH serovar prediction was NOT assigned as mash distance {mash_dist} > {MASH_DISTANCE_THRESHOLD} ")
logging.info(f"Overall serovar NOT assigned by MASH serovar as closest ref. genome MASH distance {mash_dist} > {MASH_DISTANCE_THRESHOLD} ")

if serovar_prediction.serovar is None:
logging.info(f"Antigen predictor has a serovar result {antigen_predictor.serovar} and it will be assigned as a final serovar ...")
logging.info(f"Overall serovar assigned by antigen alleles serovar: {antigen_predictor.serovar}")
serovar_prediction.serovar = serovar_prediction.serovar_antigen

if serovar_prediction.h1 is None:
Expand Down

0 comments on commit 0d31550

Please sign in to comment.