From 94ee1be21f4c1ffccfbda59a6cb7aef1da4552ab Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 15 Nov 2022 11:14:27 +0000
Subject: [PATCH 01/67] terminate program if closing early

---
 cazy_webscraper/__init__.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cazy_webscraper/__init__.py b/cazy_webscraper/__init__.py
index a525f86e..8bf6f6ab 100644
--- a/cazy_webscraper/__init__.py
+++ b/cazy_webscraper/__init__.py
@@ -127,6 +127,9 @@ def closing_message(job, start_time, args, early_term=False):
     else:
         print(message)

+    if early_term:
+        sys.exit(1)
+

 def display_citation_info():
     """Display citation information.

From c2584757c64c55784cab00ed5ca848a4172acd4b Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 15 Nov 2022 11:41:36 +0000
Subject: [PATCH 02/67] add opt to use seqs from FASTA file

can use cached seqs in JSON and/or FASTA file, and can use a combination of
cache and db seqs
---
 .../sequences/get_genbank_sequences.py | 327 ++++++++++++++----
 1 file changed, 255 insertions(+), 72 deletions(-)

diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
index f638afd3..62a823f8 100644
--- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
+++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
@@ -82,7 +82,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
     if logger is None:
         logger = logging.getLogger(__package__)
         config_logger(args)
-    
+
     logger.info("Providing user email address to NCBI.Entrez")
     Entrez.email = args.email

@@ -122,92 +122,262 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
         session,
         args,
     )
-    
-    # retrieve dict of genbank accession and genbank db ids from the local CAZyme db
-    if args.genbank_accessions is not None:
-        logger.warning(f"Getting GenBank accessions from file: {args.genbank_accessions}")
-
-        with open(args.genbank_accessions, "r") as fh:
-            lines = fh.read().splitlines()
-
-        accessions = [line.strip() for line in lines]
-        accessions = set(accessions)
+    seq_dict = get_cache_seqs(start_time, args)

-        gbk_dict = get_selected_gbks.get_ids(accessions, connection)
+    gbk_dict = {}

-    else:
-        gbk_dict = get_selected_gbks.get_genbank_accessions(
+    if args.file_only:
+        logger.warning(
+            "Only adding seqs in JSON and/or FASTA file.\n"
+            "Not retrieving seqs from NCBI\n"
+        )
+
+    else:  # retrieve data from NCBI for seqs in the local db
+        seq_dict, gbk_dict = get_seqs_from_ncbi(
             class_filters,
             family_filters,
             taxonomy_filter_dict,
             kingdom_filters,
             ec_filters,
+            seq_dict,
+            start_time,
             connection,
+            cache_dir,
+            args,
         )
+
+    logger.warning(f"Adding {len(list(seq_dict.keys()))} sequences to the local CAZyme db")
+
+    # get acc and db ids for seqs to be added to the db from the cache
+    acc_with_unknown_db_id = list(set(seq_dict.keys()) - set(gbk_dict.keys()))
+    if len(acc_with_unknown_db_id) > 0:
+        gbk_dict.update(get_selected_gbks.get_ids(acc_with_unknown_db_id, connection))
+
+    if len(list(seq_dict.keys())) != 0:
+        closing_message()
+
+    add_genbank_data.add_gbk_seqs_to_db(seq_dict, date_today, gbk_dict, connection, args)
+
+    closing_message("Get GenBank Sequences", start_time, args)
+
+
+def get_seqs_from_ncbi(
+    class_filters,
+    family_filters,
+    taxonomy_filter_dict,
+    kingdom_filters,
+    ec_filters,
+    seq_dict,
+    start_time,
+    connection,
+    cache_dir,
+    args,
+):
+    """Coordinate retrieving sequence data from NCBI for proteins not retrieved from cache files
+
+    :param seq_dict: dict {id: seq} of
seqs retrieved from cache json/fasta files
+    :param start_time: str: time program was called
+    :param connection: open connection to a SQLite db engine
+    :param args: CLI args parser
+
+    Return dicts of {acc: Bio.Seq} and {gbk acc: db id}
+    """
+    logger = logging.getLogger(__name__)
+
+    gbk_dict = get_records_to_retrieve(
+        class_filters,
+        family_filters,
+        taxonomy_filter_dict,
+        kingdom_filters,
+        ec_filters,
+        seq_dict,
+        start_time,
+        connection,
+        args,
+    )
+
+    genbank_accessions = list(gbk_dict.keys())
+    logger.warning(f"Retrieving GenBank sequences from NCBI for {len(gbk_dict.keys())}")
+
+    if len(genbank_accessions) == 0:
+        return seq_dict
+
+    cache_path = cache_dir / f"genbank_seqs_{start_time}.fasta"
+
+    downloaded_seqs, no_seq_acc = get_sequences(genbank_accessions, cache_path, args)
+
+    # cache accs of proteins for which not seq could be retrieved from NCBI
+    if len(no_seq_acc) != 0:
+        no_seq_cache = cache_dir / f"no_seq_retrieved_{start_time}.txt"
+
+        logger.warning(
+            f"No protein sequence retrieved for {len(no_seq_acc)} proteins\n"
+            f"The GenBank accessions for these proteins have been written to: {no_seq_cache}"
+        )
+
+        try:
+            with open(no_seq_cache, "a") as fh:
+                for acc in no_seq_acc:
+                    fh.write(f"{acc}\n")
+        except FileNotFoundError:
+            logger.error(
+                "Could not cache acc of proteins for which not seqs were retrieved from NCBI to:\n"
+                f"{no_seq_cache}"
+            )
+
+    # only cache the sequence. Seq obj is not JSON serializable
+    cache_dict = {}
+    for key in seq_dict:
+        cache_dict[key] = str(seq_dict[key])
+
+    # cache the retrieved sequences
+    cache_path = cache_dir / f"genbank_seqs_{start_time}.json"
+    with open(cache_path, "w") as fh:
+        json.dump(cache_dict, fh)
+
+    for record in downloaded_seqs:
+        seq_dict[record.id] = record.seq
+
+    return seq_dict, gbk_dict
+
+
+def get_cache_seqs(start_time, args):
+    """Extract protein sequences from FASTA and/or JSON file, which will be added to the local CAZyme database
+
+    :param seq_dict: dict, {genbank_acc: Bio.Seq}
+
+    Return updated seq_dict
+    """
+    logger = logging.getLogger(__name__)
+
+    seq_dict = {}

     if args.seq_dict:
-        logger.warning(f"Getting sequences from cache: {args.seq_dict}")
-        with open(args.seq_dict, "r") as fh:
-            cache_dict = json.load(fh)
+        logger.warning(f"Getting sequences from JSON cache:\n{args.seq_dict}")
+
+        try:
+            with open(args.seq_dict, "r") as fh:
+                cache_dict = json.load(fh)
+
+        except FileNotFoundError:
+            logger.error(
+                f"Could not find JSON file of protein sequences at:\n"
+                f"{args.seq_dict}\n"
+                "Check the path is correct\n"
+                "Terminating program"
+            )
+            closing_message("Get GenBank seqs", start_time, args, early_term=True)

         # convert strs to SeqRecords
-        seq_dict = {}
         for key in cache_dict:
             seq_dict[key] = Seq(cache_dict[key])

-    else:
-        seq_dict, no_seq = get_sequences(genbank_accessions, args)  # {gbk_accession: seq}
+    if args.seq_file:
+        logger.warning(f"Getting sequences from FASTA cache:\n{args.seq_file}")

-        # only cache the sequence. 
Seq obj is not JSON serializable - cache_dict = {} - for key in seq_dict: - cache_dict[key] = str(seq_dict[key]) + try: + for record in SeqIO.parse(args.seq_file, "fasta"): + try: + seq_dict[record.id] + if seq_dict[record.id] != record.seq: + logger.warning( + f"Retrieved seq for {record.id} from JSON file which does NOT match the " + "seq in the FASTA file.\n" + "Adding seq from the FASTA file to the local CAZyme database\n" + f"JSON seq: {seq_dict[record.id]}\n" + f"FASTA seq: {record.seq}" + ) + seq_dict[record.id] = record.seq + except KeyError: + seq_dict[record.id] = record.seq + + except FileNotFoundError: + logger.error( + f"Could not find FASTA file of protein sequences at:\n" + f"{args.seq_file}\n" + "Check the path is correct" + "Terminating program" + ) + closing_message("Get GenBank seqs", start_time, args, early_term=True) + + return seq_dict + + +def get_records_to_retrieve( + class_filters, + family_filters, + taxonomy_filter_dict, + kingdom_filters, + ec_filters, + seq_dict, + start_time, + connection, + args, +): + """Get Genbank accessions to retrieved data from NCBI for + + :param seq_dict: dict {id: seq} of seqs retrieved from cache json/fasta files + :param start_time: str: time program was called + :param connection: open connection to a SQLite db engine + :param args: CLI args parser + + Return a dict {gbk_acc: gbk db id + """ + logger = logging.getLogger(__name__) - # cache the retrieved sequences - cache_path = cache_dir / f"genbank_seqs_{time_stamp}.json" - with open(cache_path, "w") as fh: - json.dump(cache_dict, fh) + # retrieve dict of genbank accession and genbank db ids from the local CAZyme db + if args.genbank_accessions is not None: + logger.warning(f"Getting GenBank accessions from file: {args.genbank_accessions}") - if len(no_seq) != 0: - no_seq_cache = cache_dir / f"no_seq_retrieved_{time_stamp}.txt" - logger.warning( - f"No protein sequence retrieved for {len(no_seq)}\n" - f"The GenBank accessions for these proteins have been written to: {no_seq_cache}" + try: + with open(args.genbank_accessions, "r") as fh: + lines = fh.read().splitlines() + except FileNotFoundError: + logging.error( + "Could not find list of GenBank accessions at:\n" + f"{args.genbank_accessions}" + "Check the path is correct\n" + "Terminating program" ) - with open(no_seq_cache, "a") as fh: - for acc in no_seq: - fh.write(f"{acc}\n") + closing_message("Get GenBank seqs", start_time, args, early_term=True) - logger.warning(f"Adding {len(list(seq_dict.keys()))} protein seqs to the db") + accessions = [line.strip() for line in lines if line not in list(seq_dict.keys())] + accessions = set(accessions) - seq_records = [] - for acc in seq_dict: - sequence = Seq(seq_dict[acc]) - record = SeqRecord(sequence, id=acc) - seq_records.append(record) - - SeqIO.write(seq_records, "ce_gbk_protein_seqs.fasta", "fasta") + gbk_dict = get_selected_gbks.get_ids(accessions, connection) - add_genbank_data.add_gbk_seqs_to_db(seq_dict, date_today, gbk_dict, connection, args) + else: + gbk_dict = get_selected_gbks.get_genbank_accessions( + class_filters, + family_filters, + taxonomy_filter_dict, + kingdom_filters, + ec_filters, + connection, + ) + + # don't retrieve data for cached seqs + for gbk_acc in gbk_dict: + if gbk_acc in list(seq_dict.keys()): + del gbk_dict[gbk_acc] - closing_message("get_genbank_sequences", start_time, args) + return gbk_dict -def get_sequences(genbank_accessions, args, retry=False): +def get_sequences(genbank_accessions, cache_path, args, retry=False): """Retrieve protein 
sequences from Entrez. :param genbank_accessions: list, GenBank accessions + :param cache_path: Path, to fasta file to write out retrieved protein seqs :param args: cmb-line args parser :param retry: bool, default False, if get_sequences is being called for retrying a previously failed query - Return dict keyed by GenBank accession and valued by Seq instance, and a list of all GenBank accessions + Return list of SeqRecords, and a list of all GenBank accessions for which no record from NCBI was retrieved. """ logger = logging.getLogger(__name__) - seq_dict = {} # {gbk_accession: SeqRecord} + seq_records = [] # SeqRecords, can't be set, SeqRecords are unhasable # the list of accessions is to long, break down into smaller chunks for batch querying all_queries = get_chunks_list(genbank_accessions, args.batch_size) @@ -220,6 +390,7 @@ def get_sequences(genbank_accessions, args, retry=False): for query_list in tqdm(all_queries, desc="Batch querying NCBI.Entrez"): + # POST IDS try: epost_webenv, epost_query_key = bulk_query_ncbi(query_list, args) except RuntimeError: @@ -234,12 +405,12 @@ def get_sequences(genbank_accessions, args, retry=False): failed_queries.append(query_list) continue - + if epost_webenv is None: return None, None + # Retrieve seqs try: - # retrieve the protein sequences with entrez_retry( args.retries, Entrez.efetch, @@ -253,26 +424,22 @@ def get_sequences(genbank_accessions, args, retry=False): temp_accession = record.id # check if multiple items returned in ID - temp_accession = temp_accession.split("|") - retrieved_accession = None - - for acc in temp_accession: - if acc.strip() in genbank_accessions: - retrieved_accession = acc - - if retrieved_accession is None: # if could not retrieve GenBank accession from the record + try: + retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in genbank_accessions][0] + except IndexError: logger.error( - "Could not retrieve a GenBank protein accession matching an accession from the local database from:\n" + "Could not retrieve a GenBank protein accession matching " + "an accession from the local database from:\n" f"{record.id}\n" - "The sequence from this record will not be added to the db" + "The sequence from this record will not be added to the db" ) irregular_accessions.append(temp_accession) continue - - seq_dict[retrieved_accession] = record.seq - success_accessions.add(retrieved_accession) - + if retrieved_accession not in success_accessions: + seq_records.add(record) + success_accessions.add(retrieved_accession) + except IncompleteRead as err: logger.warning( "IncompleteRead error raised:\n" @@ -286,13 +453,29 @@ def get_sequences(genbank_accessions, args, retry=False): failed_queries.append(all_queries) continue + except Exception as err: + logger.warning( + "Error raised:\n" + f"{err}\n" + "Will reattempt NCBI query later" + ) + + if retry: + return None, None + + failed_queries.append(all_queries) + continue + + # cache the currently retrieved seqs + SeqIO.write(seq_records, cache_path, "fasta") + # list of GenBank accessions for which no protein sequence was retrieved - no_seq = [acc for acc in genbank_accessions if acc not in success_accessions] + no_seq = list(set(genbank_accessions) - set(success_accessions)) no_seq += irregular_accessions if retry: - return seq_dict, no_seq + return seq_records, no_seq if len(failed_queries) != 0: for failed_query in tqdm(failed_queries, desc="Reparsing failed queries"): @@ -364,28 +547,28 @@ def bulk_query_ncbi(accessions, args): def retry_failed_queries(query, 
seq_dict, success_accessions, args): """Parse queries which previously raised a Runtime Error, likey the result of containing accessions that are not present in NCBI. - + :param query: list GenBank accessions :param seq_dict: dict, keyed by GenBank accession, valued by BioPython Seq obj :param success_acessions: list of GenBank accessions for which a protein sequence was retrieved :param args: cmd-line args parser - + Return seq_dict and success_accessions with data from the reparsed failed queries """ failed_accessions = [] # accessions of proteins for which no record was retrieved from NCBI.Entrez new_seq_dict, no_seq = get_sequences(query, args, retry=True) - + if new_seq_dict is None: # failed retrieval of data from NCBI for acc in query: new_seq_dict, no_seq = get_sequences([acc], args, retry=True) - + if new_seq_dict is not None: # retrieve data for this accession seq_dict[acc] = new_seq_dict[acc] else: # failed to retrieve data for this accession failed_accessions.append(acc) - + else: # successful retrieval of data from NCBI for acc in new_seq_dict: seq_dict[acc] = new_seq_dict[acc] From 53d2506123d7217e4f829efc1511d5a8d0298fc3 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 15 Nov 2022 11:43:48 +0000 Subject: [PATCH 03/67] shorten line lengths --- .../sequences/get_genbank_sequences.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index 62a823f8..1a632190 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -202,7 +202,7 @@ def get_seqs_from_ncbi( if len(genbank_accessions) == 0: return seq_dict - cache_path = cache_dir / f"genbank_seqs_{start_time}.fasta" + cache_path = cache_dir / f"genbank_seqs_{start_time}.fasta" downloaded_seqs, no_seq_acc = get_sequences(genbank_accessions, cache_path, args) @@ -242,7 +242,8 @@ def get_seqs_from_ncbi( def get_cache_seqs(start_time, args): - """Extract protein sequences from FASTA and/or JSON file, which will be added to the local CAZyme database + """Extract protein sequences from FASTA and/or JSON file, which will be added to the + local CAZyme database :param seq_dict: dict, {genbank_acc: Bio.Seq} @@ -281,8 +282,8 @@ def get_cache_seqs(start_time, args): seq_dict[record.id] if seq_dict[record.id] != record.seq: logger.warning( - f"Retrieved seq for {record.id} from JSON file which does NOT match the " - "seq in the FASTA file.\n" + f"Retrieved seq for {record.id} from JSON file which does NOT match " + "the seq in the FASTA file.\n" "Adding seq from the FASTA file to the local CAZyme database\n" f"JSON seq: {seq_dict[record.id]}\n" f"FASTA seq: {record.seq}" @@ -370,9 +371,10 @@ def get_sequences(genbank_accessions, cache_path, args, retry=False): :param genbank_accessions: list, GenBank accessions :param cache_path: Path, to fasta file to write out retrieved protein seqs :param args: cmb-line args parser - :param retry: bool, default False, if get_sequences is being called for retrying a previously failed query + :param retry: bool, default False, if get_sequences is being called for retrying a previously + failed query - Return list of SeqRecords, and a list of all GenBank accessions + Return list of SeqRecords, and a list of all GenBank accessions for which no record from NCBI was retrieved. 
""" logger = logging.getLogger(__name__) @@ -382,7 +384,8 @@ def get_sequences(genbank_accessions, cache_path, args, retry=False): # the list of accessions is to long, break down into smaller chunks for batch querying all_queries = get_chunks_list(genbank_accessions, args.batch_size) - failed_queries = [] # lists which raised an error, likely because contain an accession not in NCBI + failed_queries = [] # lists which raised an error, + # likely because contain an accession not in NCBI irregular_accessions = [] @@ -431,7 +434,7 @@ def get_sequences(genbank_accessions, cache_path, args, retry=False): "Could not retrieve a GenBank protein accession matching " "an accession from the local database from:\n" f"{record.id}\n" - "The sequence from this record will not be added to the db" + "The sequence from this record will not be added to the db" ) irregular_accessions.append(temp_accession) continue @@ -545,8 +548,8 @@ def bulk_query_ncbi(accessions, args): def retry_failed_queries(query, seq_dict, success_accessions, args): - """Parse queries which previously raised a Runtime Error, likey the result of containing accessions - that are not present in NCBI. + """Parse queries which previously raised a Runtime Error, likey the result of + containing accessions that are not present in NCBI. :param query: list GenBank accessions :param seq_dict: dict, keyed by GenBank accession, valued by BioPython Seq obj @@ -555,7 +558,7 @@ def retry_failed_queries(query, seq_dict, success_accessions, args): Return seq_dict and success_accessions with data from the reparsed failed queries """ - failed_accessions = [] # accessions of proteins for which no record was retrieved from NCBI.Entrez + failed_accessions = [] # Acc of proteins for which no record was retrieved from NCBI.Entrez new_seq_dict, no_seq = get_sequences(query, args, retry=True) From d53203cafe8e2a9d2c34aa52f6c1eaa85d790c8f Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 15 Nov 2022 11:49:46 +0000 Subject: [PATCH 04/67] update docs with new CLI args --- README.md | 6 +++++ .../utilities/parsers/gbk_seq_parser.py | 24 ++++++++++++++++++- docs/genbank.rst | 6 ++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d008f7f8..5007c96b 100644 --- a/README.md +++ b/README.md @@ -593,6 +593,8 @@ Providing the file types **is** case sensitive, but the order the file types are cw_get_uniprot_data my_cazyme_db/cazyme_db.db --ec_filter 'EC1.2.3.4,EC2.3.1.-' ``` +`-F`, `--file_only` - Only add seqs provided via JSON and/or FASTA file. Do not retrieved data from NCBI. + `--families` - List of CAZy (sub)families to scrape. `--genbank_accessions` - Path to text file containing a list of GenBank accessions to retrieve protein data for. A unique accession per line. @@ -613,6 +615,10 @@ cw_get_uniprot_data my_cazyme_db/cazyme_db.db --ec_filter 'EC1.2.3.4,EC2.3.1.-' `--retries`, `-r` - Define the number of times to retry making a connection to CAZy if the connection should fail. Default: 10. +`--seq_dict` - Path to a JSON file, keyed by GenBank accessions and valued by protein sequence. Add seqs in file to the local CAZyme database. + +`--seq_file` - Path to a FASTA file, keyed by GenBank accessions and valued by protein sequence. Add seqs in file to the local CAZyme database. + `--sql_echo` - Set SQLite engine echo parameter to True, causing SQLite to print log messages. Default: False. `--species` - List of species written as Genus Species) to restrict the scraping of CAZymes to. 
diff --git a/cazy_webscraper/utilities/parsers/gbk_seq_parser.py b/cazy_webscraper/utilities/parsers/gbk_seq_parser.py
index e7432608..030082fb 100644
--- a/cazy_webscraper/utilities/parsers/gbk_seq_parser.py
+++ b/cazy_webscraper/utilities/parsers/gbk_seq_parser.py
@@ -130,6 +130,15 @@ def build_parser(argv: Optional[List] = None):
         help="Force file in existing cache directory",
     )

+    parser.add_argument(
+        "-F",
+        "--file_only",
+        dest="file_only",
+        action="store_true",
+        default=False,
+        help="Only add seqs provided via JSON and/or FASTA file. Do not retrieve data from NCBI",
+    )
+
     # Add option to specify families to retrieve protein sequences for
     parser.add_argument(
         "--families",
@@ -195,7 +204,20 @@ def build_parser(argv: Optional[List] = None):
         "--seq_dict",
         type=Path,
         default=None,
-        help="Path to a JSON file, keyed by GenBank accessions and valued by protein sequence",
+        help=(
+            "Path to a JSON file, keyed by GenBank accessions and valued by protein sequence\n"
+            "Add seqs in file to the local CAZyme database"
+        ),
+    )
+
+    parser.add_argument(
+        "--seq_file",
+        type=Path,
+        default=None,
+        help=(
+            "Path to a FASTA file of protein sequences\n"
+            "Add seqs in file to the local CAZyme database"
+        ),
     )

     # Add option to update sequences if the retrieved sequence is different
diff --git a/docs/genbank.rst b/docs/genbank.rst
index c1a6758a..469ccf4b 100644
--- a/docs/genbank.rst
+++ b/docs/genbank.rst
@@ -60,6 +60,8 @@ Command line options

 ``--force``, ``-f`` - Force writing cache to an existing cache directory.

+``--file_only``, ``-F`` - Only add seqs provided via JSON and/or FASTA file. Do not retrieve data from NCBI.
+
 ``--families`` - List of CAZy (sub)families to retrieve UniProt protein data for.

 ``--genbank_accessions`` - Path to text file containing a list of GenBank accessions to retrieve protein data for. A unique accession per line.

@@ -76,7 +78,9 @@ Command line options

 ``--retries``, ``-r`` - Define the number of times to retry making a connection to CAZy if the connection should fail. Default: 10.

-``--seq_dict``, - Path to a JSON file, keyed by GenBank accessions and valued by protein sequence. This file is created as part of the cache, after all protein sequences are retrieved from GenBank. This skips the retrieval of the protein sequences from GenBank.
+``--seq_dict`` - Path to a JSON file, keyed by GenBank accessions and valued by protein sequence. This file is created as part of the cache, after all protein sequences are retrieved from GenBank. This skips the retrieval from GenBank of the sequences included in the file.
+
+``--seq_file`` - Path to a FASTA file of protein sequences, with GenBank accessions as the sequence IDs. This file is created as part of the cache, after all protein sequences are retrieved from GenBank. This skips the retrieval from GenBank of the sequences included in the file.

 ``--seq_update`` - If a newer version of the protein sequence is available, overwrite the existing sequence for the protein in the database. Default is false, the protein sequence is **not** overwritten and updated.
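To make the cache-loading behaviour behind `--seq_dict` and `--seq_file` concrete, the following is a minimal sketch of the logic that `get_cache_seqs()` (added in PATCH 02) applies: the JSON cache is read first, then FASTA records are layered on top, with the FASTA sequence winning any conflict. This is an illustration only, not the package's implementation; the function name and file paths are hypothetical, and Biopython is assumed to be installed.

```python
# Illustrative sketch only (not cazy_webscraper's own code): merge a JSON cache
# ({accession: sequence string}) with a FASTA cache, FASTA taking precedence.
import json

from Bio import SeqIO
from Bio.Seq import Seq


def load_cached_seqs(json_path=None, fasta_path=None):
    """Return {GenBank accession: Bio.Seq} built from the two cache files."""
    seq_dict = {}

    if json_path:  # e.g. a genbank_seqs_<date>.json cache file
        with open(json_path) as fh:
            seq_dict = {acc: Seq(seq) for acc, seq in json.load(fh).items()}

    if fasta_path:  # e.g. a genbank_seqs_<date>.fasta cache file
        for record in SeqIO.parse(fasta_path, "fasta"):
            # a FASTA record overrides a conflicting JSON entry
            if record.id not in seq_dict or seq_dict[record.id] != record.seq:
                seq_dict[record.id] = record.seq

    return seq_dict
```

Giving the FASTA file precedence matches the warning emitted by `get_cache_seqs()` when the two caches disagree: the FASTA sequence is the one added to the local CAZyme database.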
From ec6cb5473019f3a3d79c125e70cf8901d0624fda Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 15 Nov 2022 15:36:13 +0000
Subject: [PATCH 05/67] add tutorial for using genbank seq cache

---
 docs/genbanktutorial.rst | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/docs/genbanktutorial.rst b/docs/genbanktutorial.rst
index cf78584f..70d99a80 100644
--- a/docs/genbanktutorial.rst
+++ b/docs/genbanktutorial.rst
@@ -286,3 +286,26 @@ CAZymes of interest.
 Therefore, if ``--genbank_accessions`` and ``--classes`` are used, ``cw_get_genbank_seqs`` will ignore
 the ``--classes`` flag and only retrieve protein sequences for the proteins listed in the file provided
 via the ``--genbank_accessions``.
+
+
+-------------------------------
+Providing sequences from a file
+-------------------------------
+
+While ``cw_get_genbank_seqs`` is retrieving protein sequences from NCBI, the retrieved protein sequences
+are written to a FASTA file in the cache directory.
+
+To add sequences from a cached FASTA file (e.g. to continue a download that was previously interrupted) and/or
+add GenBank sequences from a previous download (e.g. by a colleague), use the ``--seq_file`` flag followed by
+the path to the FASTA file containing the protein sequences to be added to the database. The ID of
+each sequence **must** be the NCBI protein version accession.
+
+``cw_get_genbank_seqs`` also generates a JSON file of the cached sequences. To add sequences from the
+cached JSON file to the local CAZyme database, use the ``--seq_dict`` flag followed by the path to the
+JSON file.
+
+By default ``cw_get_genbank_seqs`` will add the sequences retrieved from the FASTA and/or JSON file **and** will retrieve
+protein sequences from NCBI for proteins matching the provided criteria that define the proteins of interest.
+
+To add **only** the sequences from a FASTA and/or JSON file, and **not** download any sequences from NCBI, use
+the ``--file_only`` flag.

From 8c4144651cbb2be6e34099e4a28ffa9a3a8acdb4 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 15 Nov 2022 15:40:00 +0000
Subject: [PATCH 06/67] cache seqs as retrieved. 
refactorise code --- .../sequences/get_genbank_sequences.py | 312 ++++++++---------- 1 file changed, 139 insertions(+), 173 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index 1a632190..fae6e89d 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -161,86 +161,6 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No closing_message("Get GenBank Sequences", start_time, args) -def get_seqs_from_ncbi( - class_filters, - family_filters, - taxonomy_filter_dict, - kingdom_filters, - ec_filters, - seq_dict, - start_time, - connection, - cache_dir, - args, -): - """Coordinate retrieving sequence data from NCBI for proteins not retrieved from cache files - - :param seq_dict: dict {id: seq} of seqs retrieved from cache json/fasta files - :param start_time: str: time program was called - :param connection: open connection to a SQLite db engine - :param args: CLI args parser - - Return dicts of {acc: Bio.Seq} and {gbk acc: db id} - """ - logger = logging.getLogger(__name__) - - gbk_dict = get_records_to_retrieve( - class_filters, - family_filters, - taxonomy_filter_dict, - kingdom_filters, - ec_filters, - seq_dict, - start_time, - connection, - args, - ) - - genbank_accessions = list(gbk_dict.keys()) - logger.warning(f"Retrieving GenBank sequences from NCBI for {len(gbk_dict.keys())}") - - if len(genbank_accessions) == 0: - return seq_dict - - cache_path = cache_dir / f"genbank_seqs_{start_time}.fasta" - - downloaded_seqs, no_seq_acc = get_sequences(genbank_accessions, cache_path, args) - - # cache accs of proteins for which not seq could be retrieved from NCBI - if len(no_seq_acc) != 0: - no_seq_cache = cache_dir / f"no_seq_retrieved_{start_time}.txt" - - logger.warning( - f"No protein sequence retrieved for {len(no_seq_acc)} proteins\n" - f"The GenBank accessions for these proteins have been written to: {no_seq_cache}" - ) - - try: - with open(no_seq_cache, "a") as fh: - for acc in no_seq_acc: - fh.write(f"{acc}\n") - except FileNotFoundError: - logger.error( - "Could not cache acc of proteins for which not seqs were retrieved from NCBI to:\n" - f"{no_seq_cache}" - ) - - # only cache the sequence. 
Seq obj is not JSON serializable - cache_dict = {} - for key in seq_dict: - cache_dict[key] = str(seq_dict[key]) - - # cache the retrieved sequences - cache_path = cache_dir / f"genbank_seqs_{start_time}.json" - with open(cache_path, "w") as fh: - json.dump(cache_dict, fh) - - for record in downloaded_seqs: - seq_dict[record.id] = record.seq - - return seq_dict, gbk_dict - - def get_cache_seqs(start_time, args): """Extract protein sequences from FASTA and/or JSON file, which will be added to the local CAZyme database @@ -365,24 +285,145 @@ def get_records_to_retrieve( return gbk_dict -def get_sequences(genbank_accessions, cache_path, args, retry=False): +def get_seqs_from_ncbi( + class_filters, + family_filters, + taxonomy_filter_dict, + kingdom_filters, + ec_filters, + seq_dict, + start_time, + connection, + cache_dir, + args, +): + """Coordinate retrieving sequence data from NCBI for proteins not retrieved from cache files + + :param seq_dict: dict {id: seq} of seqs retrieved from cache json/fasta files + :param start_time: str: time program was called + :param connection: open connection to a SQLite db engine + :param args: CLI args parser + + Return dicts of {acc: Bio.Seq} and {gbk acc: db id} + """ + logger = logging.getLogger(__name__) + + gbk_dict = get_records_to_retrieve( + class_filters, + family_filters, + taxonomy_filter_dict, + kingdom_filters, + ec_filters, + seq_dict, + start_time, + connection, + args, + ) + + genbank_accessions = list(gbk_dict.keys()) + logger.warning(f"Retrieving GenBank sequences from NCBI for {len(gbk_dict.keys())}") + + if len(genbank_accessions) == 0: + return seq_dict + + cache_path = cache_dir / f"genbank_seqs_{start_time}.fasta" + + # break up long list into managable chunks + all_queries = get_chunks_list(genbank_accessions, args.batch_size) + + # list of downloaded SeqRecords and list of gbk acc for which no seq was retrieved from NCBI + downloaded_seqs, failed_queries = get_sequences(all_queries, cache_dir, args) + + # retry failed accs + no_seq_acc = [] + if len(failed_queries) != 0: + # break up and query individually + retrying_acc = {} # {acc: # of tries} + for batch in failed_queries: + for acc in batch: + retrying_acc[acc] = 0 + + finished_retry = False + + acc_list = list(retrying_acc) + + no_seq_acc = set() # set of accessions for which no seq could be retrieved + + while finished_retry is False: + acc_list = list(retrying_acc) + + for accession in acc_list: + new_seq, failed_seq = get_sequences([[accession]], cache_dir, args) + + if len(new_seq) == 0: + retrying_acc[accession] += 1 + + if retrying_acc[accession] >= args.retries: + del retrying_acc[accession] + no_seq_acc.add(accession) + + acc_list = list(retrying_acc) + + if len(acc_list) > 0: + finished_retry = True + + # cache accs of proteins for which not seq could be retrieved from NCBI + if len(no_seq_acc) != 0: + no_seq_cache = cache_dir / f"no_seq_retrieved_{start_time}.txt" + + logger.warning( + f"No protein sequence retrieved for {len(no_seq_acc)} proteins\n" + f"The GenBank accessions for these proteins have been written to: {no_seq_cache}" + ) + + try: + with open(no_seq_cache, "a") as fh: + for acc in no_seq_acc: + fh.write(f"{acc}\n") + except FileNotFoundError: + logger.error( + "Could not cache acc of proteins for which not seqs were retrieved from NCBI to:\n" + f"{no_seq_cache}" + ) + + # only cache the sequence. 
Seq obj is not JSON serializable + cache_dict = {} + for key in seq_dict: + cache_dict[key] = str(seq_dict[key]) + + # cache the retrieved sequences + cache_path = cache_dir / f"genbank_seqs_{start_time}.json" + with open(cache_path, "w") as fh: + json.dump(cache_dict, fh) + + for record in downloaded_seqs: + seq_dict[record.id] = record.seq + + return seq_dict, gbk_dict + + +def get_sequences(batches, cache_dir, args): """Retrieve protein sequences from Entrez. - :param genbank_accessions: list, GenBank accessions - :param cache_path: Path, to fasta file to write out retrieved protein seqs + :param batches: list of lists, one list be batch of gbk acc to query against NCBI + :param cache_dir: Path, to cache directory :param args: cmb-line args parser - :param retry: bool, default False, if get_sequences is being called for retrying a previously failed query - Return list of SeqRecords, and a list of all GenBank accessions - for which no record from NCBI was retrieved. + Return + * list of SeqRecords + * list of failed batches """ logger = logging.getLogger(__name__) - seq_records = [] # SeqRecords, can't be set, SeqRecords are unhasable + cache_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + cache_path = cache_dir / f"genbank_seqs_{cache_time}.json" - # the list of accessions is to long, break down into smaller chunks for batch querying - all_queries = get_chunks_list(genbank_accessions, args.batch_size) + gbk_acc_to_retrieve = [] + for batch in batches: + gbk_acc_to_retrieve += [acc for acc in batch] + + seq_records = [] # SeqRecords, can't be set, SeqRecords are unhasable failed_queries = [] # lists which raised an error, # likely because contain an accession not in NCBI @@ -391,11 +432,11 @@ def get_sequences(genbank_accessions, cache_path, args, retry=False): success_accessions = set() # accessions for which seqs were retrieved - for query_list in tqdm(all_queries, desc="Batch querying NCBI.Entrez"): + for batch in tqdm(batches, desc="Batch querying NCBI.Entrez"): # POST IDS try: - epost_webenv, epost_query_key = bulk_query_ncbi(query_list, args) + epost_webenv, epost_query_key = bulk_query_ncbi(batch, args) except RuntimeError: logger.warning( "Runtime error raised when batch quering\n" @@ -403,14 +444,12 @@ def get_sequences(genbank_accessions, cache_path, args, retry=False): "Attempt identification of the causal accession later\n" ) - if retry: - return None, None - - failed_queries.append(query_list) + failed_queries.append(batch) continue if epost_webenv is None: - return None, None + failed_queries.append(batch) + continue # Retrieve seqs try: @@ -428,7 +467,7 @@ def get_sequences(genbank_accessions, cache_path, args, retry=False): # check if multiple items returned in ID try: - retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in genbank_accessions][0] + retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in gbk_acc_to_retrieve][0] except IndexError: logger.error( "Could not retrieve a GenBank protein accession matching " @@ -450,10 +489,7 @@ def get_sequences(genbank_accessions, cache_path, args, retry=False): "Will reattempt NCBI query later" ) - if retry: - return None, None - - failed_queries.append(all_queries) + failed_queries.append(batch) continue except Exception as err: @@ -463,10 +499,7 @@ def get_sequences(genbank_accessions, cache_path, args, retry=False): "Will reattempt NCBI query later" ) - if retry: - return None, None - - failed_queries.append(all_queries) + failed_queries.append(batch) continue # cache the currently 
retrieved seqs
+    SeqIO.write(seq_records, cache_path, "fasta")

     # list of GenBank accessions for which no protein sequence was retrieved
-    no_seq = list(set(genbank_accessions) - set(success_accessions))
-    no_seq += irregular_accessions
-
-    if retry:
-        return seq_records, no_seq
-
-    if len(failed_queries) != 0:
-        for failed_query in tqdm(failed_queries, desc="Reparsing failed queries"):
-            first_half = failed_query[:int((len(failed_query)/2))]
-
-            seq_dict, success_accessions, failed_accessions = retry_failed_queries(
-                first_half,
-                seq_dict,
-                success_accessions,
-                args,
-            )
-
-            no_seq += failed_accessions
-
-            second_half = failed_query[int((len(failed_query)/2)):]
-
-            seq_dict, success_accessions, failed_accessions = retry_failed_queries(
-                second_half,
-                seq_dict,
-                success_accessions,
-                args,
-            )
-
-            no_seq += failed_accessions
-
-    logger.warning(f"Retrieved sequences for {len(success_accessions)} proteins")
-
-    return seq_dict, no_seq
+    return seq_records, failed_queries


 def bulk_query_ncbi(accessions, args):
@@ -547,40 +548,5 @@ def bulk_query_ncbi(accessions, args):
     return epost_webenv, epost_query_key


-def retry_failed_queries(query, seq_dict, success_accessions, args):
-    """Parse queries which previously raised a Runtime Error, likely the result of
-    containing accessions that are not present in NCBI.
-
-    :param query: list GenBank accessions
-    :param seq_dict: dict, keyed by GenBank accession, valued by BioPython Seq obj
-    :param success_acessions: list of GenBank accessions for which a protein sequence was retrieved
-    :param args: cmd-line args parser
-
-    Return seq_dict and success_accessions with data from the reparsed failed queries
-    """
-    failed_accessions = []  # Acc of proteins for which no record was retrieved from NCBI.Entrez
-
-    new_seq_dict, no_seq = get_sequences(query, args, retry=True)
-
-    if new_seq_dict is None:  # failed retrieval of data from NCBI
-        for acc in query:
-            new_seq_dict, no_seq = get_sequences([acc], args, retry=True)
-
-            if new_seq_dict is not None:  # retrieve data for this accession
-                seq_dict[acc] = new_seq_dict[acc]
-
-            else:  # failed to retrieve data for this accession
-                failed_accessions.append(acc)
-
-    else:  # successful retrieval of data from NCBI
-        for acc in new_seq_dict:
-            seq_dict[acc] = new_seq_dict[acc]
-            success_accessions.add(acc)
-
-    failed_accessions += no_seq
-
-    return seq_dict, success_accessions, failed_accessions
-
-
 if __name__ == "__main__":
     main()
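The batching-and-retry flow that PATCH 06 introduces can be distilled to the pattern below: query NCBI in batches first, then re-query each accession from a failed batch on its own, giving up after a fixed number of attempts. This is a simplified sketch of the intended control flow, not the committed implementation; `fetch_batch` is a stand-in for the `bulk_query_ncbi`/`Entrez.efetch` calls, returning a list of records or `None` on failure.

```python
# Simplified sketch of PATCH 06's batch-then-single retry pattern (illustrative only).
def retrieve_with_retries(batches, fetch_batch, max_retries):
    records, failed_batches = [], []

    for batch in batches:  # first pass: whole batches
        result = fetch_batch(batch)
        if result is None:
            failed_batches.append(batch)
        else:
            records.extend(result)

    # second pass: accessions from failed batches, queried one at a time
    attempts = {acc: 0 for batch in failed_batches for acc in batch}
    no_seq_accessions = set()
    while attempts:
        for acc in list(attempts):
            result = fetch_batch([acc])
            if result:
                records.extend(result)
                del attempts[acc]
            else:
                attempts[acc] += 1
                if attempts[acc] >= max_retries:  # ran out of attempts
                    del attempts[acc]
                    no_seq_accessions.add(acc)

    return records, no_seq_accessions
```

Retrying per accession isolates the single bad accession that can make an entire `epost` batch fail, at the cost of one request per problem accession.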
From cf3e395987bc4b3d387c61fcf64c5ddcc230af14 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 15 Nov 2022 15:51:33 +0000
Subject: [PATCH 07/67] compress the cache directory once done

---
 .../expand/genbank/sequences/get_genbank_sequences.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
index fae6e89d..00d0b600 100644
--- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
+++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
@@ -43,6 +43,7 @@
 import json
 import logging
 import pandas as pd
+import shutil

 from datetime import datetime
 from typing import List, Optional
@@ -153,6 +154,14 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
     if len(acc_with_unknown_db_id) > 0:
         gbk_dict.update(get_selected_gbks.get_ids(acc_with_unknown_db_id, connection))

+    try:
+        shutil.make_archive(cache_dir, 'tar', cache_dir)
+        shutil.rmtree(cache_dir)
+    except FileNotFoundError:
+        logger.error(
+            "Failed to compress cache directory"
+        )
+
     if len(list(seq_dict.keys())) == 0:

From c504074bbc056516c19a04215c4e9fa4fa191c4b Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 15 Nov 2022 15:56:24 +0000
Subject: [PATCH 08/67] update version num

---
 README.md                   | 2 +-
 cazy_webscraper/__init__.py | 2 +-
 docs/index.rst              | 2 --
 setup.py                    | 2 +-
 4 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 5007c96b..d6e57cf1 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 [![CircleCI](https://circleci.com/gh/HobnobMancer/cazy_webscraper.svg?style=shield)](https://circleci.com/gh/HobnobMancer/cazy_webscraper)
 [![codecov](https://codecov.io/gh/HobnobMancer/cazy_webscraper/branch/master/graph/badge.svg)](https://codecov.io/gh/HobnobMancer/cazy_webscraper)
 [![Documentation Status](https://readthedocs.org/projects/cazy-webscraper/badge/?version=latest)](https://cazy-webscraper.readthedocs.io/en/latest/?badge=latest)
-[![Anaconda-Server Badge](https://anaconda.org/bioconda/cazy_webscraper/badges/installer/conda.svg)](https://anaconda.org/bioconda/cazy_webscraper)
+[![Anaconda-Server Badge](https://anaconda.org/bioconda/cazy_webscraper/badges/version.svg)](https://anaconda.org/bioconda/cazy_webscraper)
 [![Anaconda-Update Badge](https://anaconda.org/bioconda/cazy_webscraper/badges/latest_release_date.svg)](https://anaconda.org/bioconda/cazy_webscraper)
 [![pyani PyPi version](https://img.shields.io/pypi/v/cazy_webscraper "PyPI version")](https://pypi.python.org/pypi/cazy_webscraper)
 [![Downloads](https://pepy.tech/badge/cazy-webscraper)](https://pepy.tech/project/cazy-webscraper)
diff --git a/cazy_webscraper/__init__.py b/cazy_webscraper/__init__.py
index 8bf6f6ab..8c0557ab 100644
--- a/cazy_webscraper/__init__.py
+++ b/cazy_webscraper/__init__.py
@@ -54,7 +54,7 @@
 from cazy_webscraper.sql import sql_orm


-__version__ = "2.2.2"
+__version__ = "2.2.3"

 VERSION_INFO = f"cazy_webscraper version: {__version__}"
diff --git a/docs/index.rst b/docs/index.rst
index 6c885ca5..828c0681 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -41,8 +41,6 @@ Build Information
 ``bioconda``
 ------------

-.. image:: https://anaconda.org/bioconda/cazy_webscraper/badges/installer/conda.svg?style=flat-square
-   :target: https://conda.anaconda.org/bioconda
 .. image:: https://anaconda.org/bioconda/cazy_webscraper/badges/version.svg?style=flat-square
    :target: https://anaconda.org/bioconda/cazy_webscraper
 .. image:: https://anaconda.org/bioconda/cazy_webscraper/badges/latest_release_date.svg?style=flat-square
diff --git a/setup.py b/setup.py
index e949b289..364382cf 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7 @@

 setuptools.setup(
     name="cazy_webscraper",
-    version="2.2.2",
+    version="2.2.3",
     # Metadata
     author="Emma E. M. Hobbs",
     author_email="eemh1@st-andrews.ac.uk",

From 3f84d95d31abf73d4f2d013a9645858b65b8a0b5 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 15 Nov 2022 15:57:28 +0000
Subject: [PATCH 09/67] update installation instructions

---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index d6e57cf1..261c27c1 100644
--- a/README.md
+++ b/README.md
@@ -57,8 +57,6 @@ Protein sequences (retrieved from GenBank and/or UniProt) from the local CAZyme

 Please see the [full documentation at ReadTheDocs](https://cazy-webscraper.readthedocs.io/en/latest/?badge=latest). 
-**The `bioconda` installation method is not currently supported, but we are working on getting this fixed soon**. For now please install via pypi or from source. - ## Table of Contents From d01a69fbc350b0b0ae71080f45b6188b1d592fa6 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 15 Nov 2022 16:05:22 +0000 Subject: [PATCH 10/67] update closing message unit tests --- tests/test_webscraper.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/tests/test_webscraper.py b/tests/test_webscraper.py index 733f5d8f..2c7871c3 100644 --- a/tests/test_webscraper.py +++ b/tests/test_webscraper.py @@ -359,7 +359,9 @@ def mock_parser(*args, **kwargs): monkeypatch.setattr(ArgumentParser, "parse_args", mock_parser) monkeypatch.setattr(saint_logger, "config_logger", mock_config_logger) - cazy_scraper.main() + with pytest.raises(SystemExit) as pytest_wrapped_e: + cazy_scraper.main() + assert pytest_wrapped_e.type == SystemExit def test_main_new_db_exists_force(db_path, mock_building_parser, mock_config_logger, monkeypatch): @@ -656,7 +658,9 @@ def test_closing_message_verbose(): ) } - closing_message('cazy_webscraper', start_time, argsdict['args'], early_term=True) + with pytest.raises(SystemExit) as pytest_wrapped_e: + closing_message('cazy_webscraper', start_time, argsdict['args'], early_term=True) + assert pytest_wrapped_e.type == SystemExit # # # test get_cazy_data() @@ -753,9 +757,12 @@ def test_term_message(): start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") start_time = pd.to_datetime(start_time) - closing_message( - job="Test", - start_time=start_time, - args=argsdict["args"], - early_term=True - ) + + with pytest.raises(SystemExit) as pytest_wrapped_e: + closing_message( + job="Test", + start_time=start_time, + args=argsdict["args"], + early_term=True + ) + assert pytest_wrapped_e.type == SystemExit From b396a498f9543223530855cdc7107d4d23f3f544 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 15 Nov 2022 16:38:56 +0000 Subject: [PATCH 11/67] correct add to append for list --- .../expand/genbank/sequences/get_genbank_sequences.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index 00d0b600..b6da0fa5 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -488,7 +488,7 @@ def get_sequences(batches, cache_dir, args): continue if retrieved_accession not in success_accessions: - seq_records.add(record) + seq_records.append(record) success_accessions.add(retrieved_accession) except IncompleteRead as err: From b7c2f41fcab187082074d1cd0a24ed40f253f2bb Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 15 Nov 2022 16:41:43 +0000 Subject: [PATCH 12/67] add logging messages --- .../expand/genbank/sequences/get_genbank_sequences.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index b6da0fa5..9d135d9b 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -346,6 +346,7 @@ def get_seqs_from_ncbi( # retry failed accs no_seq_acc = [] if len(failed_queries) != 0: + logger.warning("Parsing accessions which could not retrieve a seq for the first time") # break up and query 
individually
         retrying_acc = {}  # {acc: # of tries}

From 6512e0eeb588792009e4a267f196b91eed3c8baf Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Wed, 16 Nov 2022 08:24:28 +0000
Subject: [PATCH 13/67] IncompleteRead error capture when posting ids

---
 .../expand/genbank/sequences/get_genbank_sequences.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
index 9d135d9b..56d0a3bf 100644
--- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
+++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
@@ -550,6 +550,12 @@ def bulk_query_ncbi(accessions, args):
     except NotXMLError:
         logger.error("Could not parse Entrez output")
         return None, None
+    except IncompleteRead:
+        logger.error(
+            "IncompleteRead error raised when posting protein record(s)\n"
+            "Will try again later"
+        )
+        return None, None

     # retrieve the web environment and query key from the Entrez post
     epost_webenv = epost_result["WebEnv"]

From 297e1c3753d4a4dd571fedfa4e18e95db8801825 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Wed, 16 Nov 2022 15:59:27 +0000
Subject: [PATCH 14/67] ignore blank lines in the acc file

---
 cazy_webscraper/expand/uniprot/get_uniprot_data.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index 46d3088a..8ea024ce 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -140,7 +140,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
         with open(args.genbank_accessions, "r") as fh:
             lines = fh.read().splitlines()

-        accessions = [line.strip() for line in lines]
+        accessions = [line.strip() for line in lines if len(line) != 0]
         accessions = set(accessions)

         gbk_dict = get_selected_gbks.get_ids(accessions, connection)
@@ -238,12 +238,16 @@ def get_uniprot_data(uniprot_gbk_dict, cache_dir, args):

     # add PDB column to columns to be retrieved
     UniProt()._valid_columns.append('database(PDB)')
-    
+
+    print(list(uniprot_gbk_dict.keys()))
+
     bioservices_queries = get_chunks_list(
         list(uniprot_gbk_dict.keys()),
         args.bioservices_batch_size,
     )

+    print(bioservices_queries)
+
     for query in tqdm(bioservices_queries, "Batch retrieving protein data from UniProt"):
         uniprot_df = UniProt().get_df(entries=query, limit=None)

@@ -262,7 +266,6 @@ def get_uniprot_data(uniprot_gbk_dict, cache_dir, args):
             'Cross-reference (PDB)',
         ]]

-
     for index in tqdm(range(len(uniprot_df['Entry'])), desc="Parsing UniProt response"):
         row = uniprot_df.iloc[index]
         uniprot_acc = row['Entry'].strip()

From 908dcce7a73d7d9fcd1fe431d8d026c97b97aa Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 29 Nov 2022 09:43:14 +0000
Subject: [PATCH 15/67] use re to retrieve ncbi accessions

---
 .../sequences/get_genbank_sequences.py | 72 +++++++++++++------
 1 file changed, 50 insertions(+), 22 deletions(-)

diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
index 56d0a3bf..f2b92435 100644
--- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
+++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
@@ -43,6 +43,7 @@
 import json
 import logging
 import pandas as pd
+import re
 import shutil

 from datetime import datetime
 from typing import List, Optional
@@ 
-152,7 +153,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No # get acc and db ids for seqs to be added to the db from the cache acc_with_unknown_db_id = list(set(seq_dict.keys()) - set(gbk_dict.keys())) if len(acc_with_unknown_db_id) > 0: - gbk_dict.update(get_selected_gbks.get_ids(acc_with_unknown_db_id, connection)) + gbk_dict.update(get_selected_gbks.get_ids(acc_with_unknown_db_id, connection, cache_dir)) try: shutil.make_archive(cache_dir, 'tar', cache_dir) @@ -162,10 +163,13 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No "Failed to compress cache directory" ) - if len(list(seq_dict.keys())) != 0: - closing_message() + if len(list(seq_dict.keys())) == 0: + logger.warning("Adding 0 sequences to the local CAZyme database") + closing_message("Get GenBank Sequences", start_time, args) - add_genbank_data.add_gbk_seqs_to_db(seq_dict, date_today, gbk_dict, connection, args) + # make subdir in cache dir for logging failed seqs + make_output_directory(cache_dir, force=True, nodelete=True) + add_genbank_data.add_gbk_seqs_to_db(seq_dict, date_today, gbk_dict, connection, cache_dir, args) closing_message("Get GenBank Sequences", start_time, args) @@ -286,8 +290,10 @@ def get_records_to_retrieve( connection, ) + gbk_accs = list(gbk_dict.keys()) + # don't retrieve data for cached seqs - for gbk_acc in gbk_dict: + for gbk_acc in gbk_accs: if gbk_acc in list(seq_dict.keys()): del gbk_dict[gbk_acc] @@ -346,7 +352,10 @@ def get_seqs_from_ncbi( # retry failed accs no_seq_acc = [] if len(failed_queries) != 0: - logger.warning("Parsing accessions which could not retrieve a seq for the first time") + logger.warning( + "Parsing accessions which could not retrieve a seq for the first time\n" + f"Handling {len(failed_queries)} failed batches" + ) # break up and query individually retrying_acc = {} # {acc: # of tries} for batch in failed_queries: @@ -355,24 +364,36 @@ def get_seqs_from_ncbi( finished_retry = False - acc_list = list(retrying_acc) + acc_list = list(retrying_acc.keys()) no_seq_acc = set() # set of accessions for which no seq could be retrieved while finished_retry is False: - acc_list = list(retrying_acc) + acc_list = list(retrying_acc.keys()) - for accession in acc_list: - new_seq, failed_seq = get_sequences([[accession]], cache_dir, args) + logger.warning( + "Handling failed protein accessions\n" + f"{len(acc_list)} protein accessions remaining" + ) + + for accession in tqdm(acc_list, desc="Handling failed accessions"): + new_seq, failed_seq = get_sequences([[accession]], cache_dir, args, disable_prg_bar=True) if len(new_seq) == 0: + logger.warning( + f"Failed to retrieve sequence for {accession} on try no. {retrying_acc[accession]}" + ) retrying_acc[accession] += 1 if retrying_acc[accession] >= args.retries: + logger.warning( + f"Could not retrieve sequence for {accession}\n" + "Ran out of attempts" + ) del retrying_acc[accession] no_seq_acc.add(accession) - acc_list = list(retrying_acc) + acc_list = list(retrying_acc.keys()) if len(acc_list) > 0: finished_retry = True @@ -412,7 +433,7 @@ def get_seqs_from_ncbi( return seq_dict, gbk_dict -def get_sequences(batches, cache_dir, args): +def get_sequences(batches, cache_dir, args, disable_prg_bar=False): """Retrieve protein sequences from Entrez. 
    :param batches: list of lists, one list be batch of gbk acc to query against NCBI
     :param cache_dir: Path, to cache directory
     :param args: cmb-line args parser
@@ -427,7 +448,7 @@ def get_sequences(batches, cache_dir, args, disable_prg_bar=False):
     logger = logging.getLogger(__name__)

     cache_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    cache_path = cache_dir / f"genbank_seqs_{cache_time}.json"
+    cache_path = cache_dir / f"genbank_seqs_{cache_time}.fasta"

     gbk_acc_to_retrieve = []
     for batch in batches:
@@ -442,7 +463,7 @@ def get_sequences(batches, cache_dir, args, disable_prg_bar=False):

     success_accessions = set()  # accessions for which seqs were retrieved

-    for batch in tqdm(batches, desc="Batch querying NCBI.Entrez"):
+    for batch in tqdm(batches, desc="Batch querying NCBI.Entrez", disable=disable_prg_bar):

         # POST IDS
         try:
@@ -479,14 +500,21 @@ def get_sequences(batches, cache_dir, args, disable_prg_bar=False):
             try:
                 retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in gbk_acc_to_retrieve][0]
             except IndexError:
-                logger.error(
-                    "Could not retrieve a GenBank protein accession matching "
-                    "an accession from the local database from:\n"
-                    f"{record.id}\n"
-                    "The sequence from this record will not be added to the db"
-                )
-                irregular_accessions.append(temp_accession)
-                continue
+                # try regex search for accession in string
+                try:
+                    retrieved_accession = re.match(r"\D{3}\d+\.\d+", record.id).group()
+                except AttributeError:
+                    try:
+                        retrieved_accession = re.match(r"\D{2}_\d+\.\d+", record.id).group()
+                    except AttributeError:
+                        logger.error(
+                            "Could not retrieve a GenBank protein accession matching "
+                            "an accession from the local database from:\n"
+                            f"{record.id}\n"
+                            "The sequence from this record will not be added to the db"
+                        )
+                        irregular_accessions.append(temp_accession)
+                        continue

             if retrieved_accession not in success_accessions:
                 seq_records.append(record)
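The two regular expressions PATCH 15 adds cover the common NCBI protein accession shapes: three non-digit characters followed by digits and a version suffix (GenBank style), and two letters, an underscore, digits and a version suffix (RefSeq style). A small, self-contained demonstration of the fallback order, using made-up record IDs rather than anything from the repository:

```python
import re

# Made-up record IDs for illustration; the last one matches neither pattern,
# which corresponds to the patch's "irregular accession" error path.
record_ids = ["AXQ93358.1", "WP_012345678.1", "pdb|1ABC|A"]

for record_id in record_ids:
    # same fallback order as the patch: GenBank-style first, then RefSeq-style
    match = re.match(r"\D{3}\d+\.\d+", record_id) or re.match(r"\D{2}_\d+\.\d+", record_id)
    print(record_id, "->", match.group() if match else "no accession recognised")
```

Note that the second pattern is largely a safety net: because `\D` also matches an underscore, RefSeq-style IDs such as `WP_…` already satisfy the first pattern.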
From 8e0bbd165976f4caa81dd46c2ae1a0a678acf213 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 29 Nov 2022 09:44:34 +0000
Subject: [PATCH 16/67] add unit tests and remove unused imports

---
 .../add_data/add_genbank_data.py  | 38 +++++++++++++++----
 .../get_data/get_selected_gbks.py | 18 ++++++++-
 2 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/cazy_webscraper/sql/sql_interface/add_data/add_genbank_data.py b/cazy_webscraper/sql/sql_interface/add_data/add_genbank_data.py
index 6e36751d..86d44693 100644
--- a/cazy_webscraper/sql/sql_interface/add_data/add_genbank_data.py
+++ b/cazy_webscraper/sql/sql_interface/add_data/add_genbank_data.py
@@ -48,28 +48,52 @@
 from cazy_webscraper.sql.sql_interface.get_data.get_table_dicts import get_gbk_table_seq_dict


-def add_gbk_seqs_to_db(seq_dict, retrieval_date, gbk_dict, connection, args, unit_test=False):
+def add_gbk_seqs_to_db(seq_dict, retrieval_date, gbk_dict, connection, cache_dir, args, unit_test=False):
     """Add sequences retrieved from GenBank to the local CAZyme db
-    
+
     :param seq_dict: dict of retrieved seq {gbk_acc: seq}
     :param retrieval_date: str, date seqs were retrieved in ISO format
     :param gbk_dict: dict {gbk_acc: db_id}
     :param connection: open sqlalchemy connection to a SQLite db
+    :param cache_dir: path to cache directory
     :param args: cmd-line args parser
-    
+
     Return nothing
     """
     logger = logging.getLogger(__name__)
-    
+    cache_path = cache_dir / "seqs_with_no_db_id"
+
     # load the current Genbanks table into a dict
     gbk_table_dict = get_gbk_table_seq_dict(connection)

     records = set()  # records to update, contains tuples (gbk_acc, seq, )

     for gbk_accession in tqdm(seq_dict, desc="Compiling records for db insertion"):
-        existing_record = gbk_table_dict[gbk_accession]
+        try:
+            existing_record = gbk_table_dict[gbk_accession]
+        except KeyError:
+            logger.error(
+                f"Could not retrieve Genbanks table record for acc {gbk_accession}\n"
+                f"with seq:\n{seq_dict[gbk_accession]}\n"
+                "Not adding sequence to the local CAZyme database"
+            )
+            with open(cache_path, "a") as fh:
+                fh.write(f"{gbk_accession}\n")
+            continue

         seq = seq_dict[gbk_accession]
-        db_id = gbk_dict[gbk_accession]
+        try:
+            db_id = gbk_dict[gbk_accession]
+        except KeyError:
+            logger.error(
+                f"Could not retrieve record with accession '{gbk_accession}'\n"
+                "from the local CAZyme database.\n"
+                "Not adding this protein to the local CAZyme database.\n"
+                "Logging accession in:\n"
+                f"{cache_path}"
+            )
+            with open(cache_path, "a") as fh:
+                fh.write(f"{gbk_accession}\n")
+            continue

         if existing_record['sequence'] is None:
             records.add( (db_id, seq) )
@@ -87,7 +111,7 @@ def add_gbk_seqs_to_db(seq_dict, retrieval_date, gbk_dict, connection, cache_dir, args, uni
                     f"WHERE genbank_id = '{record[0]}'"
                 )
             )
-            #, seq_update_date = {retrieval_date}
+            # seq_update_date = {retrieval_date}
             connection.execute(
                 text(
                     "UPDATE Genbanks "
diff --git a/cazy_webscraper/sql/sql_interface/get_data/get_selected_gbks.py b/cazy_webscraper/sql/sql_interface/get_data/get_selected_gbks.py
index a892922a..6a71db77 100644
--- a/cazy_webscraper/sql/sql_interface/get_data/get_selected_gbks.py
+++ b/cazy_webscraper/sql/sql_interface/get_data/get_selected_gbks.py
@@ -68,14 +68,17 @@
 }


-def get_ids(genbank_accessions, connection):
+def get_ids(genbank_accessions, connection, cache_dir):
     """Get the local CAZyme database IDs for the list of provided GenBank accessions.

     :param genbank_accessions: set of GenBank accessions
     :param connection: open sqlalchemy engine connection
+    :param cache_dir: path to cache directory

     Return dict, keyed by GenBank accession and valued by database record ID.
     """
+    cache_path = cache_dir / "seqs_with_no_db_id"
+
     logger = logging.getLogger(__name__)
     gbk_dict = {}

     for accession in tqdm(genbank_accessions, desc="Getting local db record IDs"):
             filter(Genbank.genbank_accession == accession).\
             first()

-        gbk_dict[accession] = gbk_query.genbank_id
+        try:
+            gbk_dict[accession] = gbk_query.genbank_id
+        except AttributeError:
+            logger.error(
+                f"Could not retrieve record with accession {accession}\n"
+                "from the local CAZyme database.\n"
+                "Not adding this protein to the local CAZyme database.\n"
+                "Logging accession in:\n"
+                f"{cache_path}"
+            )
+            with open(cache_path, "a") as fh:
+                fh.write(f"{accession}\n")
 
     return gbk_dict

From 7546238160a54d77d64058c663afd67017df4c1f Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Tue, 29 Nov 2022 09:45:46 +0000
Subject: [PATCH 17/67] update unit tests

---
 tests/test_sql_ad_genbank.py | 11 ++++++++++-
 tests/test_webscraper.py     |  1 -
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/test_sql_ad_genbank.py b/tests/test_sql_ad_genbank.py
index 4ff60eb8..261544a3 100644
--- a/tests/test_sql_ad_genbank.py
+++ b/tests/test_sql_ad_genbank.py
@@ -45,7 +45,10 @@
 """
 
+from pathlib import Path
 import pytest
+import shutil
+import sys
 
 from argparse import Namespace
 
@@ -98,11 +101,17 @@ def mock_gbk_table_seq_dict(*args, **kwards):
 
     monkeypatch.setattr(add_genbank_data, "get_gbk_table_seq_dict", mock_gbk_table_seq_dict)
 
+    cache_dir = Path("tests/test_outputs/test_outputs_sql/temp_dir")
+
     add_genbank_data.add_gbk_seqs_to_db(
         seq_dict,
         retrieval_data,
         gbk_dict,
         connection,
+        cache_dir,
         argsdict['args'],
         unit_test=True,
-    )
\ No newline at end of file
+    )
+
+    shutil.rmtree(cache_dir)
+    cache_dir.mkdir(exist_ok=True)
diff --git a/tests/test_webscraper.py b/tests/test_webscraper.py
index 2c7871c3..33e13fdd 100644
--- a/tests/test_webscraper.py
+++ b/tests/test_webscraper.py
@@ -757,7 +757,6 @@ def test_term_message():
 
     start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     start_time = pd.to_datetime(start_time)
-
     with pytest.raises(SystemExit) as pytest_wrapped_e:
         closing_message(
             job="Test",

From 8f281990777eb295851beda94f536a50c5128449 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Wed, 11 Jan 2023 10:18:43 +0000
Subject: [PATCH 18/67] fix merge conflicts

---
 cazy_webscraper/__init__.py | 2 +-
 setup.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cazy_webscraper/__init__.py b/cazy_webscraper/__init__.py
index 8c0557ab..ca7a769f 100644
--- a/cazy_webscraper/__init__.py
+++ b/cazy_webscraper/__init__.py
@@ -54,7 +54,7 @@
 from cazy_webscraper.sql import sql_orm
 
 
-__version__ = "2.2.3"
+__version__ = "2.2.4"
 
 VERSION_INFO = f"cazy_webscraper version: {__version__}"
 
diff --git a/setup.py b/setup.py
index 364382cf..77aabccc 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7 @@
 
 setuptools.setup(
     name="cazy_webscraper",
-    version="2.2.3",
+    version="2.2.4",
     # Metadata
     author="Emma E. M. Hobbs",
     author_email="eemh1@st-andrews.ac.uk",

From 91422ea6edfe48d2c63578edf1e0f4275f85d3b4 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Fri, 18 Nov 2022 20:34:18 +0000
Subject: [PATCH 19/67] update version number

---
 cazy_webscraper/__init__.py | 2 +-
 setup.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cazy_webscraper/__init__.py b/cazy_webscraper/__init__.py
index ca7a769f..cffc8dd2 100644
--- a/cazy_webscraper/__init__.py
+++ b/cazy_webscraper/__init__.py
@@ -54,7 +54,7 @@
 from cazy_webscraper.sql import sql_orm
 
 
-__version__ = "2.2.4"
+__version__ = "2.2.5"
 
 VERSION_INFO = f"cazy_webscraper version: {__version__}"
 
diff --git a/setup.py b/setup.py
index 77aabccc..568abaca 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7 @@
 
 setuptools.setup(
     name="cazy_webscraper",
-    version="2.2.4",
+    version="2.2.5",
     # Metadata
     author="Emma E. M. Hobbs",
     author_email="eemh1@st-andrews.ac.uk",

From 36c482b2b7e64937f0a7da865ab49e9ffaffe2fe Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Mon, 21 Nov 2022 11:09:13 +0000
Subject: [PATCH 20/67] refactor funcs

---
 .../expand/uniprot/get_uniprot_data.py | 197 +++++++++++++-----
 1 file changed, 145 insertions(+), 52 deletions(-)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index 8ea024ce..a6bcdc58 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -110,7 +110,76 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
 
     # add log to the local CAZyme database
     logger.info("Adding log of scrape to the local CAZyme database")
+    add_db_log(
+        config_dict,
+        taxonomy_filter_dict,
+        kingdom_filters,
+        ec_filters,
+        time_stamp,
+        connection,
+        args,
+    )
+
+    # get dict of GenBank accessions and local db IDs
+    gbk_dict = get_db_gbk_accs(
+        class_filters,
+        family_filters,
+        taxonomy_filter_dict,
+        kingdom_filters,
+        ec_filters,
+        connection,
+        args,
+    )
+
+    # get cached data, or return an empty dict and set
+    uniprot_dict, all_ecs, gbk_data_to_download = get_uniprot_cache(gbk_dict, args)
+
+    if args.skip_download is False:
+        logger.warning(f"Retrieving data for {len(gbk_data_to_download)} proteins")
+
+        downloaded_uniprot_data, all_ecs = get_uniprot_data(uniprot_dict, gbk_data_to_download, cache_dir, args)
+
+        uniprot_dict.update(downloaded_uniprot_data)
+
+    else:
+        logger.warning(f"Using only data from cache:\n{args.use_uniprot_cache}")
+
+    if len(list(downloaded_uniprot_data.keys())) != 0:
+        cache_uniprot_data(uniprot_dict, cache_dir, time_stamp)
+
+    # add uniprot accessions (and sequences if seq retrieval is enabled)
+    logger.warning("Adding data to the local CAZyme database")
+    add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args)
 
+    # add ec numbers
+    if (args.ec) and (len(all_ecs) != 0):
+        logger.warning("Adding EC numbers to the local CAZyme database")
+        add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args)
+
+    # add pdb accessions
+    if args.pdb:
+        logger.warning("Adding RCSB PDB IDs to the local CAZyme database")
+        add_pdb_accessions(uniprot_dict, gbk_dict, connection, args)
+
+    closing_message("get_uniprot_data", start_time, args)
+
+
+def add_db_log(
+    config_dict,
+    taxonomy_filter_dict,
+    kingdom_filters,
+    ec_filters,
+    time_stamp,
+    connection,
+    args,
+):
+    """Add log to local db
+
+    :param connection: open connection to SQLite db engine
+    :param args: CLI args parser
+
+    Return nothing
+    """
     retrieved_annotations = "UniProt accessions, Protein names"
     if args.ec:
         retrieved_annotations += ", EC numbers"
@@ -134,9 +203,26 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
         args,
     )
 
+
+def get_db_gbk_accs(
+    class_filters,
+    family_filters,
+    taxonomy_filter_dict,
+    kingdom_filters,
+    ec_filters,
+    connection,
+    args
+):
+    """Get the GenBank accessions of proteins to retrieve UniProt data for.
+
+    :param args: CLI args parser
+
+    Return dict {gbk_acc: local db id}
+    """
     # retrieve dict of genbank accession and genbank db ids from the local CAZyme db
     if args.genbank_accessions is not None:
         logger.warning(f"Getting GenBank accessions from file: {args.genbank_accessions}")
+
         with open(args.genbank_accessions, "r") as fh:
             lines = fh.read().splitlines()
 
@@ -157,72 +243,56 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
 
     logger.warning(f"Retrieving UniProt data for {len(gbk_dict.keys())}")
 
+    return gbk_dict
+
+
+def get_uniprot_cache(gbk_dict, args):
+    """Get cached UniProt data, or return empty dict and set.
+
+    :param gbk_dict: {gbk acc: local db id}
+    :param args: CLI args parser
+
+    Return dict {uniprot: {genbank_accession: str, uniprot_name: str, pdb: set, ec: set}}
+    and set of EC numbers
+    and list of GenBank accessions to download UniProt data for
+    """
     # if using cache skip accession retrieval
+    uniprot_dict = {}  # {uniprot: {genbank_accession: str, uniprot_name: str, pdb: set, ec: set}}
+    all_ecs = set()
+    gbk_data_to_download = []
+
     if args.use_uniprot_cache is not None:
-        logger.warning(f"Using UniProt data from cache: {args.use_uniprot_cache}")
+        logger.warning(f"Getting UniProt data from cache: {args.use_uniprot_cache}")
+
         with open(args.use_uniprot_cache, "r") as fh:
             uniprot_dict = json.load(fh)
 
         if args.ec:
             all_ecs = get_ecs_from_cache(uniprot_dict)
-        else:
-            all_ecs = set()
-
-    else:
-
-        # Get the UniProt accessions/IDs for the corresponding GenBank accessions
-        if args.skip_uniprot_accessions is not None:
-            logger.warning(f"Using UniProt accessions from cache: {args.skip_uniprot_accessions}")
-            with open(args.skip_uniprot_accessions, "r") as fh:
-                uniprot_gkb_dict = json.load(fh)
-
-        else:
-            uniprot_gkb_dict = get_uniprot_accessions(gbk_dict, args)  # {uniprot_acc: {'gbk_acc': str, 'db_id': int}}
-
-            uniprot_acc_cache = cache_dir / f"uniprot_accessions_{time_stamp}.json"
-            with open(uniprot_acc_cache, "w") as fh:
-                json.dump(uniprot_gkb_dict, fh)
-
-        # get data from UniProt
-        uniprot_dict, all_ecs = get_uniprot_data(uniprot_gkb_dict, cache_dir, args)
-
-        # converts sets to lists for json serialisation
-        for uniprot_accession in uniprot_dict:
-            try:
-                uniprot_dict[uniprot_accession]['ec'] = list(uniprot_dict[uniprot_accession]['ec'])
-            except KeyError:
-                pass
-            try:
-                uniprot_dict[uniprot_accession]['pdb'] = list(uniprot_dict[uniprot_accession]['pdb'])
-            except KeyError:
-                pass
-
-        uniprot_acc_cache = cache_dir / f"uniprot_data_{time_stamp}.json"
-        with open(uniprot_acc_cache, "w") as fh:
-            json.dump(uniprot_dict, fh)
-
-    # add uniprot accessions (and sequences if seq retrieval is enabled)
-    logger.warning("Adding data to the local CAZyme database")
-    add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args)
-
-    # add ec numbers
-    if (args.ec) and (len(all_ecs) != 0):
-        add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args)
-
-    # add pdb accessions
-    if args.pdb:
-        add_pdb_accessions(uniprot_dict, gbk_dict, connection, args)
-
-    closing_message("get_uniprot_data", start_time, args)
+
+        if args.skip_download:  # only use cached data
+            return uniprot_dict, all_ecs, gbk_data_to_download
+
+    # else: check for which GenBank accessions data still needs to be retrieved from UniProt.
+    # If some of the data came from a cache, only download the missing records; if no data
+    # is provided from a cache, retrieve data for all GenBank accessions matching the provided criteria
+    if len(list(uniprot_dict.keys())) != 0:
+        for uniprot_acc in tqdm(uniprot_dict):
+            gbk_data_to_download.append(uniprot_dict[uniprot_acc]['genbank_accession'])
+    else:  # get data for all GenBank accessions from the local db matching the user criteria
+        gbk_data_to_download = list(gbk_dict.keys())
+
+    return uniprot_dict, all_ecs, gbk_data_to_download
 
 
-def get_uniprot_data(uniprot_gbk_dict, cache_dir, args):
+def get_uniprot_data(uniprot_dict, gbk_data_to_download, cache_dir, args):
     """Batch query UniProt to retrieve protein data. Save data to cache directory.
 
     Bioservices requests batch queries no larger than 200.
 
-    :param uniprot_gbk_dict: dict, keyed by UniProt accession and valued by dict of a GenBank accession
+    :param uniprot_dict: dict, keyed by UniProt accession and valued by dict of a GenBank accession
         and the local CAZyme db record ID {uniprot_acc: {'gbk_acc': str, 'db_id': int}}
+    :param gbk_data_to_download: list of NCBI protein accessions to query UniProt with
     :param cache_dir: path to directory to write out cache
     :param args: cmd-line args parser
 
@@ -401,5 +471,28 @@
 
 
+def cache_uniprot_data(uniprot_dict, cache_dir, time_stamp):
+    """Cache data retrieved from UniProt.
+
+    :param uniprot_dict: dict of data retrieved from UniProt
+    :param cache_dir: path to cache directory
+    :param time_stamp: str, time the program was called
+
+    Return nothing
+    """
+    # cache updated UniProt data
+    for uniprot_accession in uniprot_dict:
+        try:
+            uniprot_dict[uniprot_accession]['ec'] = list(uniprot_dict[uniprot_accession]['ec'])
+        except KeyError:
+            pass
+        try:
+            uniprot_dict[uniprot_accession]['pdb'] = list(uniprot_dict[uniprot_accession]['pdb'])
+        except KeyError:
+            pass
+
+    uniprot_acc_cache = cache_dir / f"uniprot_data_{time_stamp}.json"
+    with open(uniprot_acc_cache, "w") as fh:
+        json.dump(uniprot_dict, fh)
+
+
 if __name__ == "__main__":
     main()
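The refactored `get_uniprot_data()` hands the accession list to saintBioutils' `get_chunks_list` before querying bioservices, keeping each UniProt query at or below the recommended 200 entries. A minimal sketch of that chunking, with a hand-rolled stand-in for `get_chunks_list` and made-up accessions:

```python
# Minimal stand-in for saintBioutils.misc.get_chunks_list, used above to keep
# each UniProt batch query within the recommended size; accessions are dummies.
def get_chunks(lst, chunk_size):
    """Split lst into successive chunk_size-sized slices."""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]


accessions = [f"ACC{i:05d}.1" for i in range(450)]  # illustrative accession list
batches = get_chunks(accessions, 150)               # default --uniprot_batch_size
assert [len(batch) for batch in batches] == [150, 150, 150]
```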
From 9faaae011e8ec69eebf4be91b50a20f63409bcc7 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Mon, 21 Nov 2022 11:10:20 +0000
Subject: [PATCH 21/67] define missing logger

---
 cazy_webscraper/expand/uniprot/get_uniprot_data.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index a6bcdc58..164a6066 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -219,6 +219,8 @@ def get_db_gbk_accs(
 
     Return dict {gbk_acc: local db id}
     """
+    logger = logging.getLogger(__name__)
+
     # retrieve dict of genbank accession and genbank db ids from the local CAZyme db
     if args.genbank_accessions is not None:
         logger.warning(f"Getting GenBank accessions from file: {args.genbank_accessions}")

From c503b5aea87bbdbc9dd365349b998e15396bf0e9 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Mon, 21 Nov 2022 11:12:23 +0000
Subject: [PATCH 22/67] update parser change skip_download to bool not path

---
 .../utilities/parsers/uniprot_parser.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/cazy_webscraper/utilities/parsers/uniprot_parser.py b/cazy_webscraper/utilities/parsers/uniprot_parser.py
index 241781d8..3b92b1ae 100644
--- a/cazy_webscraper/utilities/parsers/uniprot_parser.py
+++ b/cazy_webscraper/utilities/parsers/uniprot_parser.py
@@ -251,6 +251,15 @@ def build_parser(argv: Optional[List] = None):
         ),
     )
 
+
+    parser.add_argument(
+        "--skip_download",
+        dest="skip_download",
+        action="store_true",
+        default=False,
+        help="Skip downloading data from UniProt. Use when only using data from cache",
+    )
+
     # Add option to force file over writting
     parser.add_argument(
         "--sql_echo",
@@ -289,13 +298,6 @@ def build_parser(argv: Optional[List] = None):
         help="Connection timeout limit (seconds)"
     )
 
-    parser.add_argument(
-        "--skip_uniprot_accessions",
-        type=Path,
-        default=None,
-        help="Path to a JSON file containing UniProt IDs, GenBank accessions and db IDs",
-    )
-
     parser.add_argument(
         "--use_uniprot_cache",
         type=Path,

From f38925fdb1ab20230511e7671a80b2c6cc3d32fd Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:23:45 +0000
Subject: [PATCH 23/67] get protein acc for gene name

Retrieve the NCBI protein version accession for the corresponding gene
name retrieved from UniProt.
---
 cazy_webscraper/ncbi/gene_names/__init__.py | 252 ++++++++++++++++++++
 1 file changed, 252 insertions(+)
 create mode 100644 cazy_webscraper/ncbi/gene_names/__init__.py

diff --git a/cazy_webscraper/ncbi/gene_names/__init__.py b/cazy_webscraper/ncbi/gene_names/__init__.py
new file mode 100644
index 00000000..e9cfcdf7
--- /dev/null
+++ b/cazy_webscraper/ncbi/gene_names/__init__.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# (c) University of St Andrews 2022
+# (c) University of Strathclyde 2022
+# Author:
+# Emma E. M. Hobbs
+
+# Contact
+# eemh1@st-andrews.ac.uk
+
+# Emma E. M. Hobbs,
+# Biomolecular Sciences Building,
+# University of St Andrews,
+# North Haugh Campus,
+# St Andrews,
+# KY16 9ST
+# Scotland,
+# UK
+
+# The MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Get the protein records for a provided list of gene names"""
+
+
+import logging
+
+from saintBioutils.genbank import entrez_retry
+from saintBioutils.misc import get_chunks_list
+from tqdm import tqdm
+
+
+def get_linked_ncbi_accessions(uniprot_dict):
+    """Get the NCBI protein version accessions for the gene names retrieved from UniProt.
+
+    :param uniprot_dict: {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}}
+
+    Return uniprot_dict with the NCBI protein accessions added.
+    """
+    logger = logging.getLogger(__name__)
+
+    gene_names = {}
+    for uniprot_acc in uniprot_dict:
+        gene_names[[uniprot_acc]['gene_name']] = uniprot_acc
+
+    ncbi_queries = get_chunks_list(
+        list(gene_names.keys()),
+        args.ncbi_batch_size,
+    )
+
+    invalid_gene_names = set()  # gene names no longer listed in NCBI
+    failed_batches = []
+
+    for batch in tqdm(ncbi_queries, desc="Batch querying NCBI to get protein accessions for gene names"):
+        uniprot_dict, invalid_gene_names, failed_batches = process_batch(
+            batch
+            uniprot_dict,
+            invalid_gene_names,
+            failed_batches,
+        )
+
+    if len(failed_batches) != 0:
+        failed_names = {}
+
+        # remove invalid IDs
+        for batch in failed_batches:
+            batch = list(set(batch).difference(set(invalid_ids)))
+
+            uniprot_dict, invalid_gene_names, processed_batch = process_batch(
+                batch
+                uniprot_dict,
+                invalid_gene_names,
+                [],  # pass an empty list
+            )
+
+            if len(processed_batch) != 0:  # batch still causing issues, retry names individually
+                for name in batch:
+                    failed_names[name] = 0
+
+        if len(list(failed_names.keys())) != 0:
+            while len(list(failed_names.keys())) != 0:
+                names_to_process = list(failed_names.keys())
+                for name in names_to_process:
+                    uniprot_dict, invalid_gene_names, processed_batch = process_batch(
+                        [name],  # gene name must be in a list
+                        uniprot_dict,
+                        invalid_gene_names,
+                        [],
+                    )
+
+                    if len(processed_batch) != 0:
+                        failed_names[name] += 1
+
+                        if failed_names[name] >= args.retries:
+                            del failed_names[name]
+                            logger.error(
+                                f"Could not retrieve NCBI protein version accession for gene name {name}\n"
+                                "from NCBI\n"
+                                f"Not adding protein data from UniProt accession {gene_names[name]} because\n"
+                                f"could not link its gene name {name} to an NCBI protein record"
+                            )
+
+    return uniprot_dict
+
+
+def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches):
+    """Coordinate processing batch query results.
+
+    :param batch: list of gene_names
+    :param uniprot_dict: {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}}
+    :param invalid_gene_names: set of gene names not in NCBI
+    :param failed_batches: list of failed batches
+
+    Return uniprot_dict, invalid_gene_names and failed_batches
+    """
+    record, success = query_ncbi(id=",".join(batch))
+
+    if success == 'invalid ID':
+        invalid_gene_names.add(batch[0])
+        continue
+
+    elif success == 'retry':
+        failed_batches.append(batch)
+        continue
+
+    epost_webenv = epost_result["WebEnv"]
+    epost_query_key = epost_result["QueryKey"]
+
+    record, success = query_ncbi(query_key=epost_query_key, WebEnv=epost_webenv)
+
+    if success == 'invalid ID':
+        invalid_gene_names.add(batch[0])
+        continue
+
+    elif success == 'retry':
+        failed_batches.append(batch)
+        continue
+
+    for prot_record in tqdm(record, desc="Parsing query output"):
+        gene_name = None
+        uniprot_acc = None
+        genbank_accession = None
+
+        for i in prot_record['GBSeq_feature-table']:
+            if i['GBFeature_key'] == 'CDS':
+                for j in i['GBFeature_intervals']:
+                    gene_name = j['GBInterval_accession'].split(".")[0]
+                    try:
+                        uniprot_acc = gene_names[gene_name]
+
+                    except KeyError:
+                        continue
+
+                for k in i['GBFeature_intervals']:
+                    genebank_accession = k['GBInterval_accession'].split(".")[0]
+
+        if gene_name is not None:
+            uniprot_dict[uniprot_acc]['genbank_accession'] = genebank_accession
+
+    return uniprot_dict, invalid_gene_names, failed_batches
+
+
+def query_ncbi(id=None, query_key=None, WebEnv=None):
+    """Post IDs or retrieved results from posting IDs to NCBI.
+
+    :param id: str of items separated with a single comma
+    :param query_key: str, from posting IDs
+    :param WebEnv: str, from posting IDs
+
+    Return the Entrez record, str indicating if the gene id is valid or the query should
+    be retried
+    """
+    logger = logging.getLogger(__name__)
+    success = None
+    record = None
+
+    try:
+        if id is not None:  # post IDs
+            epost_result = Entrez.read(
+                entrez_retry(
+                    10,
+                    Entrez.epost,
+                    db="Protein",
+                    id=",".join(batch),
+                ),
+                validate=False,
+            )
+
+        else:
+            with entrez_retry(
+                10,
+                Entrez.efetch,
+                db="Protein",
+                query_key=epost_query_key,
+                WebEnv=epost_webenv,
+                retmode='xml',
+            ) as record_handle:
+                record = Entrez.read(record_handle, validate=False)
+
+    except RuntimeError as err:
+        if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted."):
+            if len(batch) == 1:
+                logger.warning(
+                    f"Gene name {batch[0]} retrieved for UniProt entry {gene_names[batch[0]]} "
+                    "is no longer listed in NCBI\n"
+                    f"Not adding protein data for UniProt accession {gene_names[batch[0]]}"
+                )
+                success = "invalid ID"
+
+            else:
+                failed_batches.append(batch)
+                logger.warning(
+                    "Batch contains a gene name not in NCBI\n"
+                    "Will identify invalid gene names later"
+                )
+                success = "retry"
+        else:
+            logger.warning(
+                "Error occurred when batch querying NCBI\n"
+                "Will retry batch later\n"
+                "Error retrieved:\n"
+                f"{repr(err)}"
+            )
+            success = "retry"
+            continue
+
+    except (TypeError, AttributeError) as err:  # if no record is returned from call to Entrez
+        logger.warning(
+            "Error occurred when batch querying NCBI\n"
+            "Will retry batch later\n"
+            "Error retrieved:\n"
+            f"{repr(err)}"
+        )
+        success = "retry"
+
+    return record, success
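The new module follows NCBI's two-step batch pattern: post a batch of IDs with ePost, then pull the posted set back with eFetch. A hedged sketch of that round trip in isolation, using direct Bio.Entrez calls in place of saintBioutils' `entrez_retry` wrapper, with a placeholder email address and made-up gene names:

```python
# Sketch of the ePost -> eFetch round trip performed above; the email and
# gene names are placeholders, and retry handling is omitted for brevity.
from Bio import Entrez

Entrez.email = "user@example.com"  # required by NCBI Entrez

gene_names = ["geneA_0001", "geneB_0002"]  # illustrative gene names

# 1. Post the batch of IDs; NCBI returns a WebEnv/QueryKey pair.
epost_result = Entrez.read(Entrez.epost(db="Protein", id=",".join(gene_names)))

# 2. Fetch the posted records as XML and parse them with Entrez.read().
with Entrez.efetch(
    db="Protein",
    query_key=epost_result["QueryKey"],
    WebEnv=epost_result["WebEnv"],
    retmode="xml",
) as handle:
    records = Entrez.read(handle, validate=False)

# Each record's GBSeq_feature-table is then walked to pair the CDS gene
# name with the protein version accession, as process_batch() does above.
```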
From 50d559e1072539b214e2538a6af12dd172554713 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:24:11 +0000
Subject: [PATCH 24/67] update dict keys to separate gene and protein names

---
 .../sql/sql_interface/add_data/add_uniprot_data.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py
index 68ace83f..ea634c00 100644
--- a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py
+++ b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py
@@ -86,8 +86,8 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args):
 
             if args.name_update:
                 # check if the name has changed
-                existing_name = uniprot_table_dict[uniprot_acc]['name']
-                retrieved_name = uniprot_dict[uniprot_acc]['name']
+                existing_name = uniprot_table_dict[uniprot_acc]["protein_name"]
+                retrieved_name = uniprot_dict[uniprot_acc]["protein_name"]
 
                 if existing_name != retrieved_name:
                     update_record_name.add( (uniprot_acc, retrieved_name) )
@@ -119,7 +119,7 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args):
             if args.sequence or args.seq_update:
                 genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc']
                 gbk_id = gbk_dict[genbank_acc]
-                uniprot_name = uniprot_dict[uniprot_acc]["name"]
+                uniprot_name = uniprot_dict[uniprot_acc]["protein_name"]
                 seq = uniprot_dict[uniprot_acc]["sequence"]
                 date = uniprot_dict[uniprot_acc]["seq_date"]
 
@@ -128,7 +128,7 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args):
             else:  # not retrieving protein sequences
                 genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc']
                 gbk_id = gbk_dict[genbank_acc]
-                uniprot_name = uniprot_dict[uniprot_acc]["name"]
+                uniprot_name = uniprot_dict[uniprot_acc]["protein_name"]
 
                 uniprot_insert_values.add( (gbk_id, uniprot_acc, uniprot_name) )
 

From 28c290af98b2598e55919fb55fe44e749743a0e4 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:24:35 +0000
Subject: [PATCH 25/67] retrieve gene names from uniprot

to link gene names and genbank protein version accessions
---
 .../expand/uniprot/get_uniprot_data.py | 88 +++++++++----------
 1 file changed, 41 insertions(+), 47 deletions(-)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index 164a6066..9c73ab88 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -49,6 +49,7 @@
 from datetime import datetime
 from typing import List, Optional
 
+from Bio import Entrez
 from bioservices import UniProt
 from saintBioutils.misc import get_chunks_list
 from saintBioutils.uniprot import get_uniprot_accessions
@@ -57,6 +58,7 @@
 from tqdm import tqdm
 
 from cazy_webscraper import closing_message, connect_existing_db
+from cazy_webscraper.ncbi.gene_names import get_linked_ncbi_accessions
 from cazy_webscraper.sql import sql_interface
 from cazy_webscraper.sql.sql_interface.get_data import get_selected_gbks
 from cazy_webscraper.sql.sql_interface.add_data.add_uniprot_data import (
@@ -87,6 +89,8 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
     if logger is None:
         logger = logging.getLogger(__name__)
         config_logger(args)
+
+    Entrez.email = args.email
 
     # parse the configuration data (cache the uniprot data as .csv files)
     connection, logger_name, cache_dir = connect_existing_db(args, time_stamp, start_time)
@@ -133,11 +137,14 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
 
     # get cached data, or return an empty dict and set
     uniprot_dict, all_ecs, gbk_data_to_download = get_uniprot_cache(gbk_dict, args)
+    # uniprot_dict = {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}}
+    # all_ecs = set of EC numbers
+    # gbk_data_to_download = list of GenBank accs to download data for
 
     if args.skip_download is False:
         logger.warning(f"Retrieving data for {len(gbk_data_to_download)} proteins")
 
-        downloaded_uniprot_data, all_ecs = get_uniprot_data(uniprot_dict, gbk_data_to_download, cache_dir, args)
+        downloaded_uniprot_data, all_ecs = get_uniprot_data(gbk_data_to_download, cache_dir, args)
 
         uniprot_dict.update(downloaded_uniprot_data)
 
@@ -147,6 +154,12 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
     if len(list(downloaded_uniprot_data.keys())) != 0:
         cache_uniprot_data(uniprot_dict, cache_dir, time_stamp)
 
+    # Retrieve the NCBI protein version accession for each gene name retrieved from UniProt
+    # This is a more robust method for linking the correct UniProt record to the corresponding
+    # NCBI protein record than presuming only one record is returned per NCBI protein accession
+    # queried against NCBI
+    uniprot_dict.update(get_linked_ncbi_accessions(uniprot_dict))
+
     # add uniprot accessions (and sequences if seq retrieval is enabled)
     logger.warning("Adding data to the local CAZyme database")
     add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args)
@@ -287,34 +300,29 @@
 
     return uniprot_dict, all_ecs, gbk_data_to_download
 
 
-def get_uniprot_data(uniprot_dict, gbk_data_to_download, cache_dir, args):
+def get_uniprot_data(gbk_data_to_download, cache_dir, args):
     """Batch query UniProt to retrieve protein data. Save data to cache directory.
 
     Bioservices requests batch queries no larger than 200.
 
-    :param uniprot_dict: dict, keyed by UniProt accession and valued by dict of a GenBank accession
-        and the local CAZyme db record ID {uniprot_acc: {'gbk_acc': str, 'db_id': int}}
     :param gbk_data_to_download: list of NCBI protein accessions to query UniProt with
     :param cache_dir: path to directory to write out cache
     :param args: cmd-line args parser
 
     Return Dict of data retrieved from UniProt and to be added to the db
-        {uniprot_acccession: {gbk_acccession: str, uniprot_name: str, pdb: set, ec: set}}
+        {uniprot_accession: {gene_name: str, uniprot_name: str, pdb: set, ec: set}}
     Set of all retrieved EC numbers
+    Set of gene names
     """
     logger = logging.getLogger(__name__)
 
-    uniprot_dict = {}  # {uniprot: {genbank_accession: str, uniprot_name: str, pdb: set, ec: set}}
+    uniprot_dict = {}  # {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}}
     all_ecs = set()  # store all retrieved EC numbers as one tuple per unique EC number
-
-    # add PDB column to columns to be retrieved
-    UniProt()._valid_columns.append('database(PDB)')
-
-    print(list(uniprot_gbk_dict.keys()))
-
+    ncbi_gene_names = set()
+
     bioservices_queries = get_chunks_list(
-        list(uniprot_gbk_dict.keys()),
+        gbk_data_to_download,
         args.bioservices_batch_size,
     )
 
@@ -330,17 +338,29 @@
         index = 0
 
         uniprot_df = uniprot_df[[
+            'Gene Names (primary)',  # used to link UniProt record to GenBank protein accession
             'Entry',
             'Protein names',
            'EC number',
             'Sequence',
             'Date of last sequence modification',
-            'Cross-reference (PDB)',
+            'PDB',
         ]]
 
-        for index in tqdm(range(len(uniprot_df['Entry'])), desc="Parsing UniProt response"):
+        for index in tqdm(range(len(uniprot_df)), desc="Parsing UniProt response"):
             row = uniprot_df.iloc[index]
+
+            # Parse UniProt accession
             uniprot_acc = row['Entry'].strip()
+
+            # check if parsed before, in case bioservices returned duplicate proteins
+            try:
+                uniprot_dict[uniprot_acc]
+                continue  # already parsed
+            except KeyError:
+                pass
+
+            # Parse UniProt protein name
             try:
                 uniprot_name = row['Protein names'].strip()
 
             except AttributeError:
                 logger.warning(
                     f"Protein name {row['Protein names']} was returned as float not string. Converting to string"
                )
-                uniprot_name = str(row['Protein names'])
+                uniprot_name = str(row['Protein names']).strip()
 
             # remove quotation marks from the protein name, else an SQL error will be raised on insert
             uniprot_name = uniprot_name.replace("'", "")
             uniprot_name = uniprot_name.replace('"', '')
             uniprot_name = uniprot_name.replace("`", "")
 
-            # checked if parsed before incase bioservices returned duplicate proteins
-            try:
-                uniprot_gbk_dict[uniprot_acc]
-            except KeyError:
-                logger.warning(
-                    f"UniProt ID {uniprot_acc} was retrieved from UniProt\n"
-                    "But no corresponding record was found in the local CAZyme database\n"
-                    "Skipping this UniProt ID"
-                )
-                continue  # uniprot ID does not match any retrieved previously
-
-            try:
-                uniprot_dict[uniprot_acc]
-                logger.warning(
-                    f'Multiple entries for UniProt:{uniprot_acc}, '
-                    f'GenBank:{uniprot_gbk_dict[uniprot_acc]} retrieved from UniProt,\n'
-                    'compiling data into a single record'
-                )
-            except KeyError:
-                try:
-                    uniprot_dict[uniprot_acc] = {
-                        "genbank_accession": uniprot_gbk_dict[uniprot_acc],
-                        "name": uniprot_name,
-                    }
-                except KeyError:
-                    logger.warning(
-                        f"Retrieved record with UniProt accession {uniprot_acc} but this "
-                        "accession was not\nretrieved from the UniProt REST API"
-                    )
-                    continue
+            # add protein data to uniprot dict
+            uniprot_dict[uniprot_acc] = {
+                'gene_name': str(row['Gene Names (primary)']).strip(),
+                'protein_name': uniprot_name,
+            }
 
             if args.ec:
-                # retrieve EC numbers
                 ec_numbers = row['EC number']
                 try:
                     ec_numbers = ec_numbers.split('; ')
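The parsing above walks a pandas DataFrame returned by bioservices, where UniProt packs multiple EC numbers into one `'; '`-separated string and returns NaN (a float) when a field is empty. An illustrative sketch of that pattern on a hand-built DataFrame (the column names follow the new UniProt API as used above; the data are invented):

```python
# Illustrative parse of a bioservices UniProt().get_df()-style frame;
# the DataFrame is constructed by hand, not downloaded.
import pandas as pd

uniprot_df = pd.DataFrame({
    "Entry": ["P00000"],
    "Gene Names (primary)": ["geneA"],
    "Protein names": ["Endo-1,4-beta-xylanase"],
    "EC number": ["3.2.1.8; 3.2.1.37"],
})

for _, row in uniprot_df.iterrows():
    ec_numbers = row["EC number"]
    try:
        # UniProt returns EC numbers as a single '; '-separated string
        ecs = {ec.strip() for ec in ec_numbers.split("; ")}
    except AttributeError:  # NaN (float) when no EC number is annotated
        ecs = set()
    print(row["Entry"], sorted(ecs))
```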
From f72ad977e3bddf40d2bb44cb47ea215d2f9ac583 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:25:02 +0000
Subject: [PATCH 26/67] add missing option to choices for the api

---
 cazy_webscraper/utilities/parsers/api_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cazy_webscraper/utilities/parsers/api_parser.py b/cazy_webscraper/utilities/parsers/api_parser.py
index 0645e830..ba18bba2 100644
--- a/cazy_webscraper/utilities/parsers/api_parser.py
+++ b/cazy_webscraper/utilities/parsers/api_parser.py
@@ -139,7 +139,7 @@ def build_parser(argv: Optional[List] = None):
         "--include",
         action="store",
         nargs="+",
-        choices=["class", "family", "subfamily", "organism", "uniprot_acc", "uniprot_name", "ec", "pdb", "genbank_seq", "uniprot_seq"],
+        choices=["class", "family", "subfamily", "kingdom", "organism", "uniprot_acc", "uniprot_name", "ec", "pdb", "genbank_seq", "uniprot_seq"],
         help="Additional data to include in the output file. Separate with a single space (' ')"
     )

From 055b94731bec286226bc74688f766de28396f5c4 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:29:13 +0000
Subject: [PATCH 27/67] add new args to parser

add ncbi and uniprot batch size limiters and user email address for
querying NCBI
---
 cazy_webscraper/expand/uniprot/get_uniprot_data.py |  2 +-
 .../utilities/parsers/uniprot_parser.py            | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index 9c73ab88..8e31b61a 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -323,7 +323,7 @@
 
     bioservices_queries = get_chunks_list(
         gbk_data_to_download,
-        args.bioservices_batch_size,
+        args.uniprot_batch_size,
     )
 
     print(bioservices_queries)
diff --git a/cazy_webscraper/utilities/parsers/uniprot_parser.py b/cazy_webscraper/utilities/parsers/uniprot_parser.py
index 3b92b1ae..47372a03 100644
--- a/cazy_webscraper/utilities/parsers/uniprot_parser.py
+++ b/cazy_webscraper/utilities/parsers/uniprot_parser.py
@@ -62,6 +62,13 @@ def build_parser(argv: Optional[List] = None):
         help="Path to local CAZyme database"
     )
 
+    parser.add_argument(
+        "email",
+        type=str,
+        metavar="user email address",
+        help="User email address, requirement of NCBI-Entrez",
+    )
+
     # Add optional arguments to parser
     parser.add_argument(
         "--bioservices_batch_size",
@@ -312,6 +319,13 @@ def build_parser(argv: Optional[List] = None):
         help="Batch size for queries sent to the UniProt REST API to retrieve UniProt accessions"
     )
 
+    parser.add_argument(
+        "--ncbi_batch_size",
+        type=int,
+        default=150,
+        help="Batch size for queries sent to NCBI Entrez to retrieve protein accessions for gene names retrieved from UniProt"
+    )
+
     # Add option for more detail (verbose) logging
     parser.add_argument(
         "-v",

From 3c10e022552f70d1ec379d1dfbf9c04d278ac23d Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:30:16 +0000
Subject: [PATCH 28/67] update version number

---
 setup.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/setup.py b/setup.py
index 568abaca..35edbe5c 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,11 @@
 
 setuptools.setup(
     name="cazy_webscraper",
+<<<<<<< HEAD
     version="2.2.5",
+=======
+    version="2.2.3",
+>>>>>>> update version number
     # Metadata
     author="Emma E. M. Hobbs",
     author_email="eemh1@st-andrews.ac.uk",

From ca7be8bef9c8e40c7248383df7a7d268a0627188 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:32:58 +0000
Subject: [PATCH 29/67] update citation information

---
 README.md                   | 2 +-
 cazy_webscraper/__init__.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 261c27c1..644a5227 100644
--- a/README.md
+++ b/README.md
@@ -105,7 +105,7 @@
 
 If you use `cazy_webscraper`, please cite the following publication:
 
-> Hobbs, Emma E. M.; Pritchard, Leighton; Chapman, Sean; Gloster, Tracey M. (2021): cazy_webscraper Microbiology Society Annual Conference 2021 poster. FigShare. Poster. [https://doi.org/10.6084/m9.figshare.14370860.v7](https://doi.org/10.6084/m9.figshare.14370860.v7)
+> Hobbs, E. E. M., Gloster, T. M., and Pritchard, L. (2022) 'cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets', _bioRxiv_, [https://doi.org/10.1101/2022.12.02.518825](https://www.biorxiv.org/content/10.1101/2022.12.02.518825v1)
 
 cazy_webscraper depends on a number of tools. To recognise the contributions that the authors and developers have made, please also cite the following:
 
diff --git a/cazy_webscraper/__init__.py b/cazy_webscraper/__init__.py
index cffc8dd2..43eeab7e 100644
--- a/cazy_webscraper/__init__.py
+++ b/cazy_webscraper/__init__.py
@@ -62,10 +62,10 @@
 
 CITATION_INFO = (
     "If you use cazy_webscraper in your work, please cite the following publication:\n"
-    "\tHobbs, E. E. M., Pritchard, L., Chapman, S., Gloster, T. M.,\n"
-    "\t(2021) cazy_webscraper Microbiology Society Annual Conference 2021 poster.\n"
-    "\tFigShare. Poster.\n"
-    "\thttps://doi.org/10.6084/m9.figshare.14370860.v7"
+    "\tHobbs, E. E. M., Gloster, T. M., and Pritchard, L.\n"
+    "\t(2022) 'cazy_webscraper: local compilation and interrogation of comprehensive CAZyme datasets',\n"
+    "\tbioRxiv\n"
+    "\thttps://doi.org/10.1101/2022.12.02.518825"
 )
 
 WEBSITE = "https://hobnobmancer.github.io/cazy_webscraper/"

From 8e234c529291238a78673b80e102920f7cc9ce5c Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:43:40 +0000
Subject: [PATCH 30/67] fix unit tests for expand.uniprot

---
 .../expand/uniprot/get_uniprot_data.py      |  2 +-
 cazy_webscraper/ncbi/gene_names/__init__.py | 17 ++++++++---------
 tests/test_uniprot.py                       |  4 ++++
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index 8e31b61a..73ce6167 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -158,7 +158,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
     # This is a more robust method for linking the correct UniProt record to the corresponding
     # NCBI protein record than presuming only one record is returned per NCBI protein accession
     # queried against NCBI
-    uniprot_dict.update(get_linked_ncbi_accessions(uniprot_dict))
+    uniprot_dict = get_linked_ncbi_accessions(uniprot_dict)
 
     # add uniprot accessions (and sequences if seq retrieval is enabled)
     logger.warning("Adding data to the local CAZyme database")
diff --git a/cazy_webscraper/ncbi/gene_names/__init__.py b/cazy_webscraper/ncbi/gene_names/__init__.py
index e9cfcdf7..fdb00d3d 100644
--- a/cazy_webscraper/ncbi/gene_names/__init__.py
+++ b/cazy_webscraper/ncbi/gene_names/__init__.py
@@ -57,7 +57,7 @@ def get_linked_ncbi_accessions(uniprot_dict):
 
     gene_names = {}
     for uniprot_acc in uniprot_dict:
-        gene_names[[uniprot_acc]['gene_name']] = uniprot_acc
+        gene_names[uniprot_dict[uniprot_acc]['gene_name']] = uniprot_acc
 
     ncbi_queries = get_chunks_list(
         list(gene_names.keys()),
@@ -69,7 +69,7 @@ def get_linked_ncbi_accessions(uniprot_dict):
 
     for batch in tqdm(ncbi_queries, desc="Batch querying NCBI to get protein accessions for gene names"):
         uniprot_dict, invalid_gene_names, failed_batches = process_batch(
-            batch
+            batch,
             uniprot_dict,
             invalid_gene_names,
             failed_batches,
@@ -83,7 +83,7 @@ def get_linked_ncbi_accessions(uniprot_dict):
 
         # remove invalid IDs
         for batch in failed_batches:
-            batch = list(set(batch).difference(set(invalid_ids)))
+            batch = list(set(batch).difference(set(invalid_gene_names)))
 
             uniprot_dict, invalid_gene_names, processed_batch = process_batch(
-                batch
+                batch,
                 uniprot_dict,
                 invalid_gene_names,
                 [],  # pass an empty list
             )
@@ -133,11 +133,11 @@ def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches):
 
     if success == 'invalid ID':
         invalid_gene_names.add(batch[0])
-        continue
+        return uniprot_dict, invalid_gene_names, failed_batches
 
     elif success == 'retry':
         failed_batches.append(batch)
-        continue
+        return uniprot_dict, invalid_gene_names, failed_batches
 
@@ -146,11 +146,11 @@ def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches):
 
     if success == 'invalid ID':
         invalid_gene_names.add(batch[0])
-        continue
+        return uniprot_dict, invalid_gene_names, failed_batches
 
     elif success == 'retry':
         failed_batches.append(batch)
-        continue
+        return uniprot_dict, invalid_gene_names, failed_batches
 
@@ -165,7 +165,7 @@ def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches):
                         uniprot_acc = gene_names[gene_name]
 
                     except KeyError:
-                        continue
+                        return uniprot_dict, invalid_gene_names, failed_batches
 
                 for k in i['GBFeature_intervals']:
                     genebank_accession = k['GBInterval_accession'].split(".")[0]
@@ -238,7 +238,6 @@ def query_ncbi(id=None, query_key=None, WebEnv=None):
                 f"{repr(err)}"
             )
             success = "retry"
-            continue
 
     except (TypeError, AttributeError) as err:  # if no record is returned from call to Entrez
         logger.warning(
diff --git a/tests/test_uniprot.py b/tests/test_uniprot.py
index d755b73c..f2b30f01 100644
--- a/tests/test_uniprot.py
+++ b/tests/test_uniprot.py
@@ -59,6 +59,7 @@
 from cazy_webscraper import utilities
 from cazy_webscraper.cazy_scraper import connect_existing_db
 from cazy_webscraper.expand.uniprot import get_uniprot_data
+from cazy_webscraper.ncbi.gene_names import get_linked_ncbi_accessions
 from cazy_webscraper.sql import sql_interface
 from cazy_webscraper.sql.sql_interface.add_data import add_uniprot_data
 from cazy_webscraper.sql.sql_interface.get_data import get_selected_gbks
@@ -120,6 +121,8 @@ def mock_parser(*args, **kwargs):
         pdb=True,
         skip_uniprot_accessions=None,
         use_uniprot_cache=None,
+        email="dummy_email",
+        skip_download=False,
     )
 
     return parser
@@ -148,6 +151,7 @@ def mock_get_uniprot_data(*args, **kwards):
     monkeypatch.setattr(get_selected_gbks, "get_genbank_accessions", mock_get_genbank_accessions)
     monkeypatch.setattr(get_uniprot_data, "get_uniprot_accessions", mock_get_genbank_accessions)
     monkeypatch.setattr(get_uniprot_data, "get_uniprot_data", mock_get_uniprot_data)
+    monkeypatch.setattr(get_uniprot_data, "get_linked_ncbi_accessions", mock_get_uniprot_data)
     monkeypatch.setattr(get_uniprot_data, "add_uniprot_accessions", mock_return_none)
     monkeypatch.setattr(get_uniprot_data, "add_ec_numbers", mock_return_none)
     monkeypatch.setattr(get_uniprot_data, "add_pdb_accessions", mock_return_none)

From de5a920b37ae710771e5c15b5aaa219002401410 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:43:51 +0000
Subject: [PATCH 31/67] fix unit tests for parsers

---
 tests/test_parsers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_parsers.py b/tests/test_parsers.py
index a3067653..47c22d79 100644
--- a/tests/test_parsers.py
+++ b/tests/test_parsers.py
@@ -146,7 +146,7 @@ def test_parser_uniprot():
 
 def test_parser_arsv_uniprot():
     """Test building the parser when argsv is not None"""
-    uniprot_parser.build_parser(["dummy_email"])
+    uniprot_parser.build_parser(["dbpath", "dummy_email"])
 
 
 def test_parser_ncbi_tax():

From aeb48d4af1aa07353f10daaede35f6fd81b8950a Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:49:26 +0000
Subject: [PATCH 32/67] update requirements.
use bioservices for the new uniprot api
---
 requirements.txt | 2 +-
 setup.py         | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index a301fce1..ea817a57 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 beautifulsoup4
 biopython
-bioservices
+bioservices>=1.10.4
 html5lib
 lxml
 mechanicalsoup
diff --git a/setup.py b/setup.py
index 35edbe5c..8882c3cc 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,7 @@
     },
     install_requires=[
         "biopython>=1.76",
+        "bioservices>=1.10.4",
         "mechanicalsoup",
         "pandas>=1.0.3",
         "pyyaml",

From e3cbe5bcd26e848f1f7de155492d0c417b0fa283 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 15:57:07 +0000
Subject: [PATCH 33/67] do not retry invalid gene names

---
 cazy_webscraper/ncbi/gene_names/__init__.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/cazy_webscraper/ncbi/gene_names/__init__.py b/cazy_webscraper/ncbi/gene_names/__init__.py
index fdb00d3d..d4873eb1 100644
--- a/cazy_webscraper/ncbi/gene_names/__init__.py
+++ b/cazy_webscraper/ncbi/gene_names/__init__.py
@@ -95,6 +95,19 @@
 
     if len(list(failed_names.keys())) != 0:
         while len(list(failed_names.keys())) != 0:
+            # remove invalid ids and don't retry
+            for name in invalid_ids:
+                try:
+                    failed_names[name]
+                    logger.warning(
+                        f"Gene name {name} retrieved from UniProt record {gene_names[name]}\n"
+                        "is no longer in NCBI, therefore, not adding protein data to the\n"
+                        f"local CAZyme database for UniProt record {gene_names[name]}"
+                    )
+                    del failed_names[name]
+                except KeyError:
+                    continue
+
             names_to_process = list(failed_names.keys())
             for name in names_to_process:
                 uniprot_dict, invalid_gene_names, processed_batch = process_batch(

From f80b6ac920edcb4928613371c7b3f9b093106052 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 16:03:51 +0000
Subject: [PATCH 34/67] update documentation

update uniprot args
---
 README.md                |  8 ++++++--
 docs/uniprot.rst         | 22 +++++++++++++++++-----
 docs/uniprottutorial.rst | 29 +++++++++++++++--------------
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index 644a5227..7a26b2d8 100644
--- a/README.md
+++ b/README.md
@@ -310,7 +310,7 @@ Data can be retrieived for all proteins in the local CAZyme database, or a speci
 To retrieve all UniProt data for all proteins in a local CAZyme database, use the following command:
 
 ```bash
-cw_get_uniprot_data <database> --ec --pdb --sequence
+cw_get_uniprot_data <database> <email> --ec --pdb --sequence
```

 ### Configuring UniProt data retrieval
@@ -319,7 +319,11 @@ Below are listed the command-line flags for configuring the retrieval of UniProt
 
 `database` - \[REQUIRED\] Path to a local CAZyme database to add UniProt data to.
 
-`--bioservices_batch_size` - Change the query batch size submitted via [`bioservices`](https://bioservices.readthedocs.io/en/master/) to UniProt to retrieve protein data. Default is 150. `bioservices` recommands queries not larger than 200 objects.
+`email` - \[REQUIRED\] User email address. This is required by NCBI Entrez for querying the Entrez server.
+
+`--ncbi_batch_size` - Size of batch query posted to NCBI Entrez. Default 150.
+
+`--uniprot_batch_size` - Change the query batch size submitted via [`bioservices`](https://bioservices.readthedocs.io/en/master/) to UniProt to retrieve protein data. Default is 150. `bioservices` recommends queries not larger than 200 objects.
 
 `--cache_dir` - Path to cache dir to be used instead of default cache dir path.
 
diff --git a/docs/uniprot.rst b/docs/uniprot.rst
index 9acff15e..8ba4eb7b 100644
--- a/docs/uniprot.rst
+++ b/docs/uniprot.rst
@@ -14,7 +14,7 @@ the local CAZyme database, use the following command structure:
 
 .. code-block:: bash
 
-   cw_get_uniprot_data <database>
+   cw_get_uniprot_data <database> <email>
 
 .. NOTE::
    The ``cw`` prefix on command is an abbreviation of ``cazy_webscraper``.
@@ -30,7 +30,13 @@ the local CAZyme database, use the following command structure:
 
 .. code-block:: bash
 
-   cw_get_uniprot_data <database> --ec --pdb --sequence
+   cw_get_uniprot_data <database> <email> --ec --pdb --sequence
+
+For example:
+
+.. code-block:: bash
+
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --ec --pdb --sequence
 
 --------------------
 Command line options
--------------------
@@ -40,8 +46,12 @@ Below are listed the required and optional command-line options for configuring
 
 ``database`` - **REQUIRED** Path to a local CAZyme database to add UniProt data to.
 
-``bioservices_batch_size`` - Change the size of the batch query size submitted via `bioservices <https://bioservices.readthedocs.io/en/master/>`_ to UniProt
-to retrieve protein data. Default 150. ``bioservices`` recommends submitting ueries no larger than 200 objects.
+``email`` - **REQUIRED** User email address, required by NCBI Entrez.
+
+``--ncbi_batch_size`` - Size of batch query posted to NCBI Entrez. Default 150.
+
+``--uniprot_batch_size`` - Change the size of the batch query submitted via `bioservices <https://bioservices.readthedocs.io/en/master/>`_ to UniProt
+to retrieve protein data. Default 150. ``bioservices`` recommends submitting queries no larger than 200 objects.
 
 ``--cache_dir`` - Path to cache dir to be used instead of default cache dir path.
 
@@ -85,7 +95,9 @@ to retrieve protein data.
 
 ``--seq_update`` - If a newer version of the protein sequence is available, overwrite the existing sequence for the protein in the database. Default is false, the protein sequence is **not** overwritten and updated.
 
-``--skip_uniprot_accessions`` - Path to a JSON file, keyed by UniProt accessions/IDs and valued by dicts containing ``{'gbk_acc': str, 'db_id': int}``. This file part of the cache created by ``cw_get_uniprot_data``. This is option to skip retrieving the UniProt IDs for a set of GenBank accessions, if retrieving data for the same dataset (this save a lot of time!)
+``--use_uniprot_cache`` - Path to a JSON file, keyed by UniProt accessions/IDs and valued by dicts containing ``{'gbk_acc': str, 'db_id': int}``. This file is part of the cache created by ``cw_get_uniprot_data``. This option makes it possible to skip retrieving the UniProt IDs for a set of GenBank accessions when retrieving data for the same dataset (this saves a lot of time!)
+
+``--skip_download`` - Bool, default False. If set to True, only uses data from the UniProt cache and will not download new data from UniProt.
 
 ``--sql_echo`` - Set SQLite engine echo parameter to True, causing SQLite to print log messages. Default: False.
 
diff --git a/docs/uniprottutorial.rst b/docs/uniprottutorial.rst
index 9d9bebbb..530c5454 100644
--- a/docs/uniprottutorial.rst
+++ b/docs/uniprottutorial.rst
@@ -36,13 +36,13 @@ Therefore, ``cw_get_uniprot_data`` can be enabled using a simple command structu
 
 .. code-block:: bash
 
-   cazy_webscraper <database>
+   cazy_webscraper <database> <email>
 
-For example, if our database was stored in ``cazy/cazyme.db``, we would used:
+For example, if our database was stored in ``cazy/cazyme.db``, we would use:
 
 .. code-block:: bash
 
-   cazy_webscraper cazy/cazyme.db
+   cazy_webscraper cazy/cazyme.db myemail@domain.com
 
 .. NOTE::
    Make sure ``cw_get_uniprot_data`` is pointed directly at the database file.
@@ -87,13 +87,13 @@ For example, if you want to retrieve protein data for all CAZymes from Glycoside
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --classes GH,CE
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --classes GH,CE
 
 OR
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --classes Glycoside Hydrolases,Carbohydrate Esterases
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --classes Glycoside Hydrolases,Carbohydrate Esterases
 
 Retrieving protein data for proteins from specific CAZy families is achieved using the
 ``--families`` flag. For example, to retrieve protein data for all proteins in PL1, PL2 and PL3 in the local CAZyme database use the
 following command:
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --families PL1,PL2,PL3
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --families PL1,PL2,PL3
 
 .. WARNING::
    ``cw_get_uniprot_data`` only accepts families written in the proper CAZy family syntax.
@@ -113,13 +113,13 @@ protein data for all CAZymes in PL1, PL2, PL3 and *all* of GH and CE both:
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --families PL1,PL2,PL3 --classes GH,CE
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --families PL1,PL2,PL3 --classes GH,CE
 
 **AND**
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --classes GH,CE --families PL1,PL2,PL3
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --classes GH,CE --families PL1,PL2,PL3
 
 are accepted.
 
@@ -137,7 +137,7 @@ For example, if you want to retrieve protein data for all CAZymes in a local CAZ
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --kingdoms bacteria,eukaryota
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --kingdoms bacteria,eukaryota
 
 .. warning::
    The kingdoms must be spelt the same way CAZy spells them, for example use 'eukaryot**a**' instead of 'eukaryot**e**'.
@@ -155,7 +155,7 @@ we would use would be:
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --kingdoms viruses --genera Aspergillus --species Layia carnosa,Layia chrysanthemoides --strains Trichoderma reesei QM6a,Trichoderma reesei QM9414
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --kingdoms viruses --genera Aspergillus --species Layia carnosa,Layia chrysanthemoides --strains Trichoderma reesei QM6a,Trichoderma reesei QM9414
 
 .. note::
    The order that the flags are used and the order taxa are listed does **not** matter, and separate multiple taxa names with a single comma
@@ -188,7 +188,7 @@ wish to retrieve protein data for CAZymes annotated with specific EC numbers. To
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --ec_filter "EC1.2.3.4,EC2.3.4.5"
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --ec_filter "EC1.2.3.4,EC2.3.4.5"
 
 .. NOTE::
 
@@ -241,7 +241,7 @@ To retrieve PDB accessions for all CAZymes in GH, GT, CE1, CE5 and CE8, and whic
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --pdb --classes GH,CE --families CE1,CE5,CE8 --kingdoms bacteria
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --pdb --classes GH,CE --families CE1,CE5,CE8 --kingdoms bacteria
 
 **Example 2:**
 
 To retrieve EC numbers and PDB accessions for all CAZymes in GH and which are derived from *Aspergillus* and *Trichoderma* species, use:
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --pdb --ec --classes GH --genera Aspergillus,Trichoderma
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --pdb --ec --classes GH --genera Aspergillus,Trichoderma
 
 **Example 3:**
 
 To retrieve EC numbers and protein sequences for all bacterial CAZymes in GH, CE and CBM annotated with
 EC3.2.1.23, EC3.2.1.37 and EC3.2.1.85, we use the command:
 
 .. code-block:: bash
 
-   cw_get_uniprot_data cazy/cazyme.db --ec --sequences --classes GH,CE,CBM --kingdoms bacteria --ec_filter "3.2.1.23,3.2.1.37,3.2.1.85"
+   cw_get_uniprot_data cazy/cazyme.db myemail@domain.com --ec --sequences --classes GH,CE,CBM --kingdoms bacteria --ec_filter "3.2.1.23,3.2.1.37,3.2.1.85"
 
 ------------------------------
 Providing a list of accessions
------------------------------
 
 then passed to ``cw_get_uniprot_data`` via the ``--genbank_accessions`` flag.
 
 Therefore, if ``--genbank_accessions`` and ``--classes`` are used, ``cw_get_uniprot_data`` will ignore
 the ``--classes`` flag and only retrieve protein data for the proteins listed in the file provided via
 the ``--genbank_accessions``.
+

From 3ec847755b358963953f3eea80163615b45c385a Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 16:03:58 +0000
Subject: [PATCH 35/67] update documentation

update uniprot args
---
 README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7a26b2d8..01e77af2 100644
--- a/README.md
+++ b/README.md
@@ -366,7 +366,9 @@
 
 `--retries`, `-r` - Define the number of times to retry making a connection to CAZy if the connection should fail. Default: 10.
 
-`--skip_uniprot_accessions` - Path to a JSON file, keyed by UniProt accessions/IDs and valued by dicts containing `{'gbk_acc': str, 'db_id': int}`. This file part of the cache created by `cw_get_uniprot_data`. This is option to skip retrieving the UniProt IDs for a set of GenBank accessions, if retrieving data for the same dataset (this save a lot of time!)
+`--use_uniprot_cache` - Path to a JSON file, keyed by UniProt accessions/IDs and valued by dicts containing `{'gbk_acc': str, 'db_id': int}`. This file is part of the cache created by `cw_get_uniprot_data`. This option makes it possible to skip retrieving the UniProt IDs for a set of GenBank accessions when retrieving data for the same dataset (this saves a lot of time!)
+
+`--skip_download` - Bool, default False. If set to True, only uses data from the UniProt cache and will not download new data from UniProt.
 
 `--sequence`, `-s` - Retrieve protein amino acid sequences from UniProt

From 1b361215358a6b0a65a857a51f917d216e7d4571 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 8 Dec 2022 22:17:34 +0000
Subject: [PATCH 36/67] map gene names when they couldn't be retrieved from
 UniProt

---
 .../expand/uniprot/get_uniprot_data.py      | 39 +++++++++++++--
 cazy_webscraper/ncbi/gene_names/__init__.py | 50 ++++++++++++-------
 2 files changed, 67 insertions(+), 22 deletions(-)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index 73ce6167..d6fd2596 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -154,11 +154,13 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
     if len(list(downloaded_uniprot_data.keys())) != 0:
         cache_uniprot_data(uniprot_dict, cache_dir, time_stamp)
 
+    uniprot_dict = get_gene_names(uniprot_dict, args)
     # Retrieve the NCBI protein version accession for each gene name retrieved from UniProt
     # This is a more robust method for linking the correct UniProt record to the corresponding
     # NCBI protein record than presuming only one record is returned per NCBI protein accession
     # queried against NCBI
-    uniprot_dict = get_linked_ncbi_accessions(uniprot_dict)
+    uniprot_dict = get_linked_ncbi_accessions(uniprot_dict, args)
+    print(uniprot_dict)
 
     # add uniprot accessions (and sequences if seq retrieval is enabled)
     logger.warning("Adding data to the local CAZyme database")
@@ -276,6 +278,8 @@ def get_uniprot_cache(gbk_dict, args):
     all_ecs = set()
     gbk_data_to_download = []
 
+    logger = logging.getLogger(__name__)
+
     if args.use_uniprot_cache is not None:
         logger.warning(f"Getting UniProt data from cache: {args.use_uniprot_cache}")
 
@@ -329,7 +333,7 @@ def get_uniprot_data(gbk_data_to_download, cache_dir, args):
     print(bioservices_queries)
 
     for query in tqdm(bioservices_queries, "Batch retrieving protein data from UniProt"):
-        uniprot_df = UniProt().get_df(entries=query, limit=None)
+        uniprot_df = UniProt().get_df(entries=query, limit=args.uniprot_batch_size)
 
         # cache UniProt response
         _time_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
@@ -401,7 +405,7 @@ def get_uniprot_data(gbk_data_to_download, cache_dir, args):
 
             if args.pdb:
                 # retrieve PDB accessions
-                pdb_accessions = row['Cross-reference (PDB)']
+                pdb_accessions = row['PDB']
                 try:
                     pdb_accessions = pdb_accessions.split(';')
                     pdb_accessions = [pdb.strip() for pdb in pdb_accessions if len(pdb.strip()) > 0]
@@ -490,5 +494,34 @@
         json.dump(uniprot_dict, fh)
 
 
+def get_gene_names(uniprot_dict, args):
+    """Map UniProt accessions to EMBL-GenBank-DDBJ gene names
+
+    :param uniprot_dict: {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}}
+    :param args: CLI parser
+
+    Return UniProt dict
+    """
+    uniprot_acc_to_parse = [acc for acc in uniprot_dict if uniprot_dict[acc]['gene_name'] == 'nan']
+
+    bioservices_queries = get_chunks_list(
+        uniprot_acc_to_parse,
+        args.uniprot_batch_size,
+    )
+
+    for batch in tqdm(bioservices_queries, "Getting gene names from UniProt"):
+        mapping_dict = UniProt().mapping(
+            fr="UniProtKB_AC-ID",
+            to="EMBL-GenBank-DDBJ",
+            query=batch,
+        )
+        for result in mapping_dict['results']:
+            uniprot_acc = result['from']
+            gene_name = result['to']
+            uniprot_dict[uniprot_acc]['gene_name'] = gene_name
+
+    return uniprot_dict
+
+
 if __name__ == "__main__":
     main()
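The new `get_gene_names()` above uses the UniProt REST ID-mapping endpoint, via bioservices, to recover gene names that `get_df()` returned as NaN. A hedged sketch of that call in isolation, run on a single placeholder accession (requires bioservices>=1.10.4, as pinned in patch 32):

```python
# Sketch of the ID-mapping call made by get_gene_names() above; the
# accession is a placeholder and network access is required to run it.
from bioservices import UniProt

mapping = UniProt().mapping(
    fr="UniProtKB_AC-ID",    # map from UniProt accessions...
    to="EMBL-GenBank-DDBJ",  # ...to EMBL/GenBank/DDBJ gene names
    query=["P0C2S4"],        # illustrative UniProt accession
)

# The new UniProt REST API returns {'results': [{'from': ..., 'to': ...}], ...}
for result in mapping.get("results", []):
    print(result["from"], "->", result["to"])
```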
a/cazy_webscraper/ncbi/gene_names/__init__.py b/cazy_webscraper/ncbi/gene_names/__init__.py index d4873eb1..3dd90741 100644 --- a/cazy_webscraper/ncbi/gene_names/__init__.py +++ b/cazy_webscraper/ncbi/gene_names/__init__.py @@ -41,18 +41,21 @@ import logging +from Bio import Entrez from saintBioutils.genbank import entrez_retry from saintBioutils.misc import get_chunks_list from tqdm import tqdm -def get_linked_ncbi_accessions(uniprot_dict): +def get_linked_ncbi_accessions(uniprot_dict, args): """Get the NCBI protein version accessions for the gene names retrieved from UniProt. :param uniprot_dict: {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}} + :param args: CLI parser Return uniprot_dict with the ncbi protein accessions added. """ + Entrez.email = args.email logger = logging.getLogger(__name__) gene_names = {} @@ -64,10 +67,11 @@ def get_linked_ncbi_accessions(uniprot_dict): args.ncbi_batch_size, ) - invalid_gene_names = set() # gene names no longer listed in NCBI + invalid_gene_names = {'nan'} # gene names no longer listed in NCBI failed_batches = [] for batch in tqdm(ncbi_queries, desc="Batch quering NCBI to get protein accesions for gene names"): + batch = list(set(batch).difference(set(invalid_gene_names))) uniprot_dict, invalid_gene_names, failed_batches = process_batch( batch, uniprot_dict, @@ -80,7 +84,7 @@ def get_linked_ncbi_accessions(uniprot_dict): # remove invalid IDs for batch in failed_batches: - batch = list(set(batch).difference(set(invalid_ids))) + batch = list(set(batch).difference(set(invalid_gene_names))) uniprot_dict, invalid_gene_names, processed_batch = process_batch( batch, @@ -96,7 +100,7 @@ def get_linked_ncbi_accessions(uniprot_dict): if len(list(failed_names.keys())) != 0: while len(list(failed_names.keys())) != 0: # remove invalid ids and don't retry - for name in invalid_ids: + for name in invalid_gene_names: try: failed_names[name] logger.warning( @@ -105,6 +109,7 @@ def get_linked_ncbi_accessions(uniprot_dict): f"local CAZyme database for UniProt record {gene_names[name]}" ) del failed_names[name] + del uniprot_dict[gene_names[name]] except KeyError: continue @@ -115,6 +120,7 @@ def get_linked_ncbi_accessions(uniprot_dict): uniprot_dict, invalid_gene_names, [], + uniprot_acc=gene_names[name], ) if len(processed_batch) != 0: @@ -132,17 +138,18 @@ def get_linked_ncbi_accessions(uniprot_dict): return uniprot_dict -def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches): +def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches, uniprot_acc=None): """Coordinate processing batch query results. 
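 
     The batch is first ePosted to NCBI, and the returned history (WebEnv and
     QueryKey) is then used to eFetch the records. A minimal sketch of the
     underlying Biopython calls (simplified; the real code wraps them in
     entrez_retry and the error handling described below):
 
         handle = Entrez.epost(db="Protein", id=",".join(batch))
         posted = Entrez.read(handle, validate=False)
         with Entrez.efetch(db="Protein", query_key=posted["QueryKey"],
                            WebEnv=posted["WebEnv"], retmode="xml") as fh:
             record = Entrez.read(fh, validate=False)
 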
:param batch: list of gene_names
     :param uniprot_dict: {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}}
     :param invalid_gene_names: set of gene names not in NCBI
     :param failed_batches: list of failed batches
+    :param uniprot_acc: str, UniProt record entry accession
 
     Return uniprot_dict, invalid_gene_names and failed_batches
     """
-    record, success = query_ncbi(id=",".join(batch))
+    record, success = query_ncbi(batch, gene_names=",".join(batch))
 
     if success == 'invalid ID':
         invalid_gene_names.add(batch[0])
@@ -155,7 +162,7 @@ def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches, unipr
     epost_webenv = epost_result["WebEnv"]
     epost_query_key = epost_result["QueryKey"]
 
-    record, success = query_ncbi(query_key=epost_query_key, WebEnv=epost_webenv)
+    record, success = query_ncbi(batch, query_key=epost_query_key, WebEnv=epost_webenv)
 
     if success == 'invalid ID':
         invalid_gene_names.add(batch[0])
@@ -189,12 +196,14 @@ def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches, unipr
     return uniprot_dict, invalid_gene_names, failed_batches
 
 
-def query_ncbi(id=None, query_key=None, WebEnv=None):
+def query_ncbi(batch, gene_names=None, query_key=None, WebEnv=None, uniprot_acc=None):
     """Post IDs or retrieved results from positing IDs to NCBI.
 
-    :param id: str of items separted with single comma
+    :param batch: list of gene names
+    :param gene_names: str of items separated by a single comma
     :param query_key: str, from posting IDs
     :param WebEnv: str, from posting IDs
+    :param uniprot_acc: str, UniProt record entry accession
 
     Return the Entrez record, str indicating if the gene id is valid or
     the query should be retried
@@ -204,18 +213,20 @@ def query_ncbi(id=None, query_key=None, WebEnv=None):
     record = None
 
     try:
-        if id is not None:  # post IDs
+        if gene_names is not None:  # post IDs
+            process = "ePost"
             epost_result = Entrez.read(
                 entrez_retry(
                     10,
                     Entrez.epost,
                     db="Protein",
-                    id=",".join(batch),
+                    id=gene_names,
                 ),
                 validate=False,
             )
 
         else:
+            process = "eFetch"
             with entrez_retry(
                 10,
                 Entrez.efetch,
                 db="Protein",
@@ -227,17 +238,16 @@ def query_ncbi(id=None, query_key=None, WebEnv=None):
                 record = Entrez.read(record_handle, validate=False)
 
     except RuntimeError as err:
-        if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted."):
+        if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted.") or repr(err).startswith("RuntimeError('Empty ID list; Nothing to store')"):
             if len(batch) == 1:
                 logger.warning(
-                    f"Gene name {batch[0]} retrieved for UniProt entry {gene_names[batch[0]]} "
+                    f"Gene name {batch[0]} retrieved for UniProt entry {uniprot_acc} "
                     "is no longer listed in CAZy\n"
-                    f"Not adding protein data for UniProt accessions {gene_names[batch[0]]}"
+                    f"Not adding protein data for UniProt accessions {uniprot_acc}"
                 )
                 success = "invalid ID"
 
             else:
-                failed_batches.append(batch)
                 logger.warning(
                     "Batch contains a gene name not in NCBI\n"
                     "Will identify invalid gene names laters"
                 )
                 success = "retry"
         else:
             logger.warning(
-                "Error occurenced when batch quering NCBI\n"
+                f"Runtime error occurred when batch querying NCBI ({process})\n"
                 "Will retry batch later\n"
                 "Error retrieved:\n"
-                f"{repr(err)}"
+                f"{repr(err)}\n"
+                f"Batch:\n{batch}"
             )
             success = "retry"
 
     except (TypeError, AttributeError) as err:  # if no record is returned from call to Entrez
         logger.warning(
-            "Error occurenced when batch quering NCBI\n"
+            f"Error occurred when batch
querying NCBI ({process})\n"
             "Will retry batch later\n"
             "Error retrieved:\n"
-            f"{repr(err)}"
+            f"{repr(err)}\n"
+            f"Batch:\n{batch}"
         )
         success = "retry"

From 8174df86a8578a69c065670c4193b1a6c49feb5b Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Fri, 9 Dec 2022 12:20:50 +0000
Subject: [PATCH 37/67] use the uniprot mapping service

Use the UniProt mapping service to map each UniProt accession to its
GenBank protein version accession. For proteins for which that fails
and for which no gene name was retrieved by the original UniProt query,
map the UniProt accession to the gene name. For proteins whose UniProt
accession could not be mapped to a GenBank accession, query NCBI with
the gene name to retrieve the GenBank accession
---
 .../expand/uniprot/get_uniprot_data.py      | 139 +++++++++++++++++-
 cazy_webscraper/ncbi/gene_names/__init__.py |  38 +++--
 2 files changed, 156 insertions(+), 21 deletions(-)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index d6fd2596..69623ac1 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -154,13 +154,22 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
     if len(list(downloaded_uniprot_data.keys())) != 0:
         cache_uniprot_data(uniprot_dict, cache_dir, time_stamp)
 
-    uniprot_dict = get_gene_names(uniprot_dict, args)
-    # Retrieve the NCBI protein version accession for each gene name retrieved from UniProt
-    # This a more robust method for linking the correct UniProt record to the corresponding
-    # NCBI protein record than presuming only one record is returned per NCBI protein accession
-    # queried against NCBI
-    uniprot_dict = get_linked_ncbi_accessions(uniprot_dict, args)
-    print(uniprot_dict)
+    # get genbank accessions by mapping uniprot accession to ncbi
+    # if can't get genbank accession and did not retrieve a gene name from uniprot
+    # map the uniprot acc to the gene name and then retrieve the genbank accession from ncbi
+    uniprot_dict = get_mapped_genbank_accessions(uniprot_dict, args)
+
+    for uniprot_acc in uniprot_dict:
+        try:
+            uniprot_dict[uniprot_acc]['genbank_accession']
+        except KeyError:
+            logger.error(
+                f"Could not map the UniProt accession '{uniprot_acc}' to a GenBank accession\n"
+                "directly via the UniProt mapping service or via its gene name.\n"
+                f"Not adding protein data for the UniProt accession '{uniprot_acc}' to the\n"
+                "local CAZyme database."
+            )
+            del uniprot_dict[uniprot_acc]
 
     # add uniprot accessions (and sequences if seq retrieval is enabled)
     logger.warning("Adding data to the local CAZyme database")
@@ -494,6 +503,122 @@ def cache_uniprot_data(uniprot_dict, cache_dir, time_stamp):
         json.dump(uniprot_dict, fh)
 
 
+def get_mapped_genbank_accessions(uniprot_dict, args):
+    """Map UniProt accessions to GenBank protein version accessions.
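+
+    A minimal sketch of the shape of the ``bioservices`` ``UniProt().mapping()``
+    response this function unpacks (accessions illustrative; the real response
+    carries additional fields):
+
+        {
+            'results': [{'from': 'P12345', 'to': 'AAA12345.1'}],
+            'failedIds': ['Q99999'],
+        }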
+
+    :param uniprot_dict: {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}}
+
+    Return uniprot_dict with GenBank accessions
+    """
+    logger = logging.getLogger(__name__)
+    mapping_batch_size = 25
+
+    bioservices_queries = get_chunks_list(
+        list(uniprot_dict.keys()),
+        mapping_batch_size,
+    )
+
+    failed_ids = set()
+
+    for batch in tqdm(bioservices_queries, desc="Mapping UniProt acc to GenBank acc"):
+        mapping_dict = UniProt().mapping(
+            fr="UniProtKB_AC-ID",
+            to="EMBL-GenBank-DDBJ_CDS",
+            query=batch,
+        )
+        try:
+            for mapped_pair in mapping_dict['results']:
+                uniprot_acc = mapped_pair['from']
+                gbk_acc = mapped_pair['to']
+
+                try:
+                    uniprot_dict[uniprot_acc]['genbank_accession'] = gbk_acc
+                except KeyError:
+                    logger.error(
+                        f"Retrieved UniProt accession {uniprot_acc} from UniProt but the accession was\n"
+                        "not retrieved when querying by GenBank accession"
+                    )
+                    pass
+        except KeyError:
+            pass
+
+        for acc in mapping_dict['failedIds']:
+            if acc in (list(uniprot_dict.keys())):
+                failed_ids.add(acc)
+
+    if len(failed_ids) != 0:
+        logger.warning(f"Could not map {len(failed_ids)} UniProt accessions to GenBank accessions")
+
+    failed_ids_to_parse = set()
+
+    for failed_id in failed_ids:
+        if uniprot_dict[failed_id]['gene_name'] == 'nan':
+            failed_ids_to_parse.add(failed_id)
+
+    if len(failed_ids_to_parse) != 0:
+        failed_ids = set()
+
+        logger.warning(
+            f"Could not map {len(failed_ids_to_parse)} UniProt accessions to GenBank accessions\n"
+            "and did not retrieve a gene name from UniProt.\n"
+            "Will try mapping the UniProt acc to the gene name"
+        )
+
+        # retry getting gene names if did not have gene names before
+        bioservices_queries = get_chunks_list(
+            list(failed_ids_to_parse),
+            mapping_batch_size,
+        )
+
+        for batch in tqdm(bioservices_queries, desc="Getting gene names"):
+            mapping_dict = UniProt().mapping(
+                fr="UniProtKB_AC-ID",
+                to="EMBL-GenBank-DDBJ",
+                query=batch,
+            )
+
+            try:
+                for mapped_pair in mapping_dict['results']:
+                    uniprot_acc = mapped_pair['from']
+                    gene_name = mapped_pair['to']
+
+                    try:
+                        uniprot_dict[uniprot_acc]['gene_name'] = gene_name
+                    except KeyError:
+                        logger.error(
+                            f"Retrieved UniProt accession {uniprot_acc} from UniProt but the accession was\n"
+                            "not retrieved when querying by GenBank accession"
+                        )
+                        pass
+            except KeyError:
+                pass
+
+            try:
+                for acc in mapping_dict['failedIds']:
+                    if acc in list(uniprot_dict.keys()):
+                        failed_ids.add(acc)
+            except KeyError:
+                pass
+
+    if len(failed_ids) != 0:
+        for acc in failed_ids:
+            if acc in list(uniprot_dict.keys()):
+                if uniprot_dict[acc]['gene_name'] == 'nan':
+                    logger.error(
+                        f"Could not map the UniProt accession '{acc}' to an NCBI GenBank protein\n"
+                        "accession or gene name, so can't map the UniProt accession to a GenBank\n"
+                        "accession in the local CAZyme database.\n"
+                        f"Not adding protein data for UniProt accession '{acc}' to the local\n"
+                        "CAZyme database"
+                    )
+                    del uniprot_dict[acc]
+
+    uniprot_dict = get_linked_ncbi_accessions(uniprot_dict, args)
+
+    return uniprot_dict
+
+
 def get_gene_names(uniprot_dict, args):
     """Map UniProt accessions to Ensemble-GenBank gene name

diff --git a/cazy_webscraper/ncbi/gene_names/__init__.py b/cazy_webscraper/ncbi/gene_names/__init__.py
index 3dd90741..16390732 100644
--- a/cazy_webscraper/ncbi/gene_names/__init__.py
+++ b/cazy_webscraper/ncbi/gene_names/__init__.py
@@ -58,9 +58,16 @@ def get_linked_ncbi_accessions(uniprot_dict, args):
     Entrez.email = args.email
     logger = logging.getLogger(__name__)
 
-    gene_names = {}
+    gene_names =
{} # to process for uniprot_acc in uniprot_dict: - gene_names[uniprot_dict[uniprot_acc]['gene_name']] = uniprot_acc + try: + uniprot_dict[uniprot_acc]['genbank_accession'] # already have the accession + except KeyError: + if uniprot_dict[uniprot_acc]['gene_name'] != 'nan': + gene_names[uniprot_dict[uniprot_acc]['gene_name']] = uniprot_acc + + if len(list(gene_names.keys())) == 0: # do not need to query NCBI to get the genbank accessions + return uniprot_dict ncbi_queries = get_chunks_list( list(gene_names.keys()), @@ -74,6 +81,7 @@ def get_linked_ncbi_accessions(uniprot_dict, args): batch = list(set(batch).difference(set(invalid_gene_names))) uniprot_dict, invalid_gene_names, failed_batches = process_batch( batch, + gene_names, uniprot_dict, invalid_gene_names, failed_batches, @@ -83,11 +91,12 @@ def get_linked_ncbi_accessions(uniprot_dict, args): failed_names = {} # remove invalid IDs - for batch in failed_batches: + for batch in tqdm(failed_batches, desc="Retrying failed batches"): batch = list(set(batch).difference(set(invalid_gene_names))) uniprot_dict, invalid_gene_names, processed_batch = process_batch( batch, + gene_names, uniprot_dict, invalid_gene_names, [], # pass an empty list @@ -114,9 +123,10 @@ def get_linked_ncbi_accessions(uniprot_dict, args): continue names_to_process = list(failed_names.keys()) - for name in names_to_process: + for name in tqdm(names_to_process, desc="Retrying failed gene names"): uniprot_dict, invalid_gene_names, processed_batch = process_batch( [name], # gene name must be in a list + gene_names, uniprot_dict, invalid_gene_names, [], @@ -138,7 +148,7 @@ def get_linked_ncbi_accessions(uniprot_dict, args): return uniprot_dict -def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches, uniprot_acc=None): +def process_batch(batch, gene_names, uniprot_dict, invalid_gene_names, failed_batches, uniprot_acc=None): """Coordinate processing batch query results. :param batch: list of gene_names @@ -159,10 +169,10 @@ def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches, unipr failed_batches.append(batch) return uniprot_dict, invalid_gene_names, failed_batches - epost_webenv = epost_result["WebEnv"] - epost_query_key = epost_result["QueryKey"] + epost_webenv = record["WebEnv"] + epost_query_key = record["QueryKey"] - record, success = query_ncbi(batch, query_key=epost_query_key, WebEnv=epost_webenv) + record, success = query_ncbi(batch, query_key=epost_query_key, webEnv=epost_webenv) if success == 'invalid ID': invalid_gene_names.add(batch[0]) @@ -196,7 +206,7 @@ def process_batch(batch, uniprot_dict, invalid_gene_names, failed_batches, unipr return uniprot_dict, invalid_gene_names, failed_batches -def query_ncbi(batch, gene_names=None, query_key=None, WebEnv=None, uniprot_acc=None): +def query_ncbi(batch, gene_names=None, query_key=None, webEnv=None, uniprot_acc=None): """Post IDs or retrieved results from positing IDs to NCBI. 
:param batch: list of gene names @@ -215,7 +225,7 @@ def query_ncbi(batch, gene_names=None, query_key=None, WebEnv=None, uniprot_acc= try: if gene_names is not None: # post IDs process = "ePost" - epost_result = Entrez.read( + record = Entrez.read( entrez_retry( 10, Entrez.epost, @@ -231,8 +241,8 @@ def query_ncbi(batch, gene_names=None, query_key=None, WebEnv=None, uniprot_acc= 10, Entrez.efetch, db="Protein", - query_key=epost_query_key, - WebEnv=epost_webenv, + query_key=query_key, + WebEnv=webEnv, retmode='xml', ) as record_handle: record = Entrez.read(record_handle, validate=False) @@ -241,7 +251,7 @@ def query_ncbi(batch, gene_names=None, query_key=None, WebEnv=None, uniprot_acc= if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted.") or repr(err).startswith("RuntimeError('Empty ID list; Nothing to store')"): if len(batch) == 1: logger.warning( - f"Gene name {batch[0]} retrieved for UniProt entry {uniprot_acc} " + f"Gene name {batch[0]} retrieved for UniProt entry '{uniprot_acc}' " "is no longer listed in CAZy\n" f"Not adding protein data for UniProt accessions {uniprot_acc}" ) @@ -250,7 +260,7 @@ def query_ncbi(batch, gene_names=None, query_key=None, WebEnv=None, uniprot_acc= else: logger.warning( "Batch contains a gene name not in NCBI\n" - "Will identify invalid gene names laters" + "Will identify invalid gene names later" ) success = "retry" else: From ec0e0ed5f73a01d8d874ee749500fb6efbb3adaa Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 12:23:06 +0000 Subject: [PATCH 38/67] correct typo in variable name genebank to genbank --- cazy_webscraper/ncbi/gene_names/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cazy_webscraper/ncbi/gene_names/__init__.py b/cazy_webscraper/ncbi/gene_names/__init__.py index 16390732..65f9da5f 100644 --- a/cazy_webscraper/ncbi/gene_names/__init__.py +++ b/cazy_webscraper/ncbi/gene_names/__init__.py @@ -198,10 +198,10 @@ def process_batch(batch, gene_names, uniprot_dict, invalid_gene_names, failed_ba return uniprot_dict, invalid_gene_names, failed_batches for k in i['GBFeature_intervals']: - genebank_accession = k['GBInterval_accession'].split(".")[0] + genbank_accession = k['GBInterval_accession'].split(".")[0] if gene_name is not None: - uniprot_dict[uniprot_acc]['genbank_accession'] = genebank_accession + uniprot_dict[uniprot_acc]['genbank_accession'] = genbank_accession return uniprot_dict, invalid_gene_names, failed_batches From 8bfac941c2fb9f0fcf1f322fb9d2c8762552eaf3 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 12:23:38 +0000 Subject: [PATCH 39/67] mock getting gene names --- tests/test_uniprot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_uniprot.py b/tests/test_uniprot.py index f2b30f01..9ca1df65 100644 --- a/tests/test_uniprot.py +++ b/tests/test_uniprot.py @@ -151,6 +151,7 @@ def mock_get_uniprot_data(*args, **kwards): monkeypatch.setattr(get_selected_gbks, "get_genbank_accessions", mock_get_genbank_accessions) monkeypatch.setattr(get_uniprot_data, "get_uniprot_accessions", mock_get_genbank_accessions) monkeypatch.setattr(get_uniprot_data, "get_uniprot_data", mock_get_uniprot_data) + monkeypatch.setattr(get_uniprot_data, "get_gene_names", mock_get_uniprot_data) monkeypatch.setattr(get_uniprot_data, "get_linked_ncbi_accessions", mock_get_uniprot_data) monkeypatch.setattr(get_uniprot_data, "add_uniprot_accessions", mock_return_none) monkeypatch.setattr(get_uniprot_data, "add_ec_numbers", mock_return_none) From 
43994ecac40df18da317bbc29f9172854e928ab6 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 12:24:34 +0000 Subject: [PATCH 40/67] mockmapping accessions --- tests/test_uniprot.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_uniprot.py b/tests/test_uniprot.py index 9ca1df65..469c66e1 100644 --- a/tests/test_uniprot.py +++ b/tests/test_uniprot.py @@ -152,6 +152,7 @@ def mock_get_uniprot_data(*args, **kwards): monkeypatch.setattr(get_uniprot_data, "get_uniprot_accessions", mock_get_genbank_accessions) monkeypatch.setattr(get_uniprot_data, "get_uniprot_data", mock_get_uniprot_data) monkeypatch.setattr(get_uniprot_data, "get_gene_names", mock_get_uniprot_data) + monkeypatch.setattr(get_uniprot_data, "get_mapped_genbank_accessions", mock_get_uniprot_data) monkeypatch.setattr(get_uniprot_data, "get_linked_ncbi_accessions", mock_get_uniprot_data) monkeypatch.setattr(get_uniprot_data, "add_uniprot_accessions", mock_return_none) monkeypatch.setattr(get_uniprot_data, "add_ec_numbers", mock_return_none) From 6befcfd39daec9bcf75c0a16f43a94653c88ec00 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 12:31:57 +0000 Subject: [PATCH 41/67] correct key names to get genbank acc from uniprot dict --- cazy_webscraper/expand/uniprot/get_uniprot_data.py | 9 ++++++--- .../sql/sql_interface/add_data/add_uniprot_data.py | 12 +++++++----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py index 69623ac1..48e823ba 100644 --- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py +++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py @@ -542,9 +542,12 @@ def get_mapped_genbank_accessions(uniprot_dict, args): except KeyError: pass - for acc in mapping_dict['failedIds']: - if acc in (list(uniprot_dict.keys())): - failed_ids.add(acc) + try: + for acc in mapping_dict['failedIds']: + if acc in (list(uniprot_dict.keys())): + failed_ids.add(acc) + except KeyError: + pass if len(failed_ids) != 0: logger.warning(f"Could not map {len(failed_ids)} UniProt accessions to GenBank accessions") diff --git a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py index ea634c00..b8973834 100644 --- a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +++ b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py @@ -79,7 +79,7 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): # check if the GenBank id is the same existing_gbk_id = uniprot_table_dict[uniprot_acc]['genbank_id'] - retrieved_gbk_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc'] + retrieved_gbk_acc = uniprot_dict[uniprot_acc]["genbank_accession"] retrieved_gbk_id = gbk_dict[retrieved_gbk_acc] if existing_gbk_id != retrieved_gbk_id: update_record_gbk_id.add( (uniprot_acc, retrieved_gbk_id) ) @@ -117,7 +117,9 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): except KeyError: # new record to add to local CAZyme db if args.sequence or args.seq_update: - genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc'] + print(uniprot_dict) + print(uniprot_acc) + genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"] gbk_id = gbk_dict[genbank_acc] uniprot_name = uniprot_dict[uniprot_acc]["protein_name"] seq = uniprot_dict[uniprot_acc]["sequence"] @@ -126,7 +128,7 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): 
uniprot_insert_values.add( (gbk_id, uniprot_acc, uniprot_name, seq, date) ) else: # not retrieving protein sequences - genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc'] + genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"] gbk_id = gbk_dict[genbank_acc] uniprot_name = uniprot_dict[uniprot_acc]["protein_name"] @@ -215,7 +217,7 @@ def add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args): gbk_ec_insert_values = set() for uniprot_acc in tqdm(uniprot_dict, desc="Updating EC numbers"): - genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc'] + genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"] gbk_id = gbk_dict[genbank_acc] retrieved_ec_numbers = uniprot_dict[uniprot_acc]["ec"] # EC#s retrieved from UniProt @@ -296,7 +298,7 @@ def add_pdb_accessions(uniprot_dict, gbk_dict, connection, args): # to identify pdb-protein relationships retrieved from UniProt gbk_pdb_insert_values = set() for uniprot_acc in tqdm(uniprot_dict, desc="Identifying new protein-PDB relationships to add to db"): - genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc'] + genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"] gbk_db_id = gbk_dict[genbank_acc] # set of pdb_accessions retrieved from UniProt From 9d4a721f411d67564f9e2707d41a9f7b4b4108ee Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 12:38:53 +0000 Subject: [PATCH 42/67] check the retrieved gbk acc is in the local db before adding protein data --- .../sql/sql_interface/add_data/add_uniprot_data.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py index b8973834..27acef3a 100644 --- a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +++ b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py @@ -72,6 +72,18 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): update_record_seq = set() # ((uniprot_accession, retrieved_seq), ) for uniprot_acc in tqdm(uniprot_dict, desc="Separating new and existing records"): + # check the genbank accession is in the local cazyme database + retrieved_gbk_acc = uniprot_dict[uniprot_acc]["genbank_accession"] + try: + gbk_dict[retrieved_gbk_acc] + except KeyError: + logger.error( + f"Mapped the GenBank accession '{retrieved_gbk_acc}' to the UniProt accession\n" + f"'{uniprot_acc}' but the GenBank accession is not in the local CAZyme database\n" + f"therefore, not adding protein data for GBK:{retrieved_gbk_acc}/UniProt:{uniprot_acc}" + "to the local CAZyme database." + ) + continue # check if the UniProt accession is already in the local CAZyme db try: uniprot_table_dict[uniprot_acc] From d3fa24d7fb3d3ee86c2b3c80db4d94330ff08239 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 12:41:29 +0000 Subject: [PATCH 43/67] log addition of data to the local db --- .../sql/sql_interface/add_data/add_uniprot_data.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py index 27acef3a..d794c865 100644 --- a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +++ b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py @@ -60,6 +60,8 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): Return nothing. 
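 
     An illustrative (hypothetical) uniprot_dict entry, showing the keys read
     by this function and its siblings (add_ec_numbers, add_pdb_accessions):
 
         {'P12345': {'genbank_accession': 'AAA12345.1',
                     'protein_name': 'Beta-glucosidase',
                     'sequence': 'MKV...',
                     'ec': {'3.2.1.21'},
                     'pdb': {'1ABC'}}}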
""" + logger = logging.getLogger(__name__) + # load the current Uniprot records in the local CAZyme db # {acc: {name: str, gbk_id: int, seq: str, seq_date:str } } uniprot_table_dict = get_table_dicts.get_uniprot_table_dict(connection) @@ -147,6 +149,7 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): uniprot_insert_values.add( (gbk_id, uniprot_acc, uniprot_name) ) if len(uniprot_insert_values) != 0: + logger.warning(f"Inserting {len(uniprot_insert_values)} into the local CAZyme db") if args.sequence or args.seq_update: columns = ['genbank_id', 'uniprot_accession', 'uniprot_name', 'sequence', 'seq_update_date'] else: @@ -155,6 +158,7 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): insert_data(connection, "Uniprots", columns, list(uniprot_insert_values)) if len(update_record_gbk_id) != 0: + logger.warning(f"Updating {len(update_record_gbk_id)} Gbk-UniProt relationships in the local CAZyme db") with connection.begin(): for record in tqdm(update_record_gbk_id, desc="Updating UniProt-Gbk relationships"): connection.execute( @@ -166,6 +170,7 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): ) if len(update_record_name) != 0: + logger.warning(f"Updating {len(update_record_name)} UniProt protein names in the local CAZyme db") with connection.begin(): for record in tqdm(update_record_name, desc="Updating UniProt protein names"): connection.execute( @@ -177,6 +182,7 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): ) if len(update_record_seq) != 0: + logger.warning(f"Updating {len(update_record_seq)} UniProt protein sequences in the local CAZyme db") with connection.begin(): for record in tqdm(update_record_seq, desc="Updating UniProt protein seqs"): connection.execute( From e64489ea4c40983a366141713c371983a26caa3b Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 12:44:29 +0000 Subject: [PATCH 44/67] check the retrieved gbk acc is in the local db before adding protein data --- .../add_data/add_uniprot_data.py | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py index d794c865..587a28cc 100644 --- a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +++ b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py @@ -131,8 +131,6 @@ def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): except KeyError: # new record to add to local CAZyme db if args.sequence or args.seq_update: - print(uniprot_dict) - print(uniprot_acc) genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"] gbk_id = gbk_dict[genbank_acc] uniprot_name = uniprot_dict[uniprot_acc]["protein_name"] @@ -236,7 +234,16 @@ def add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args): for uniprot_acc in tqdm(uniprot_dict, desc="Updating EC numbers"): genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"] - gbk_id = gbk_dict[genbank_acc] + try: + gbk_id = gbk_dict[genbank_acc] + except KeyError: + logger.error( + f"Mapped the GenBank accession '{retrieved_gbk_acc}' to the UniProt accession\n" + f"'{uniprot_acc}' but the GenBank accession is not in the local CAZyme database\n" + f"therefore, not adding protein data for GBK:{retrieved_gbk_acc}/UniProt:{uniprot_acc}" + "to the local CAZyme database." 
+ ) + continue retrieved_ec_numbers = uniprot_dict[uniprot_acc]["ec"] # EC#s retrieved from UniProt for ec in retrieved_ec_numbers: @@ -317,7 +324,17 @@ def add_pdb_accessions(uniprot_dict, gbk_dict, connection, args): gbk_pdb_insert_values = set() for uniprot_acc in tqdm(uniprot_dict, desc="Identifying new protein-PDB relationships to add to db"): genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"] - gbk_db_id = gbk_dict[genbank_acc] + + try: + gbk_db_id = gbk_dict[genbank_acc] + except KeyError: + logger.error( + f"Mapped the GenBank accession '{retrieved_gbk_acc}' to the UniProt accession\n" + f"'{uniprot_acc}' but the GenBank accession is not in the local CAZyme database\n" + f"therefore, not adding protein data for GBK:{retrieved_gbk_acc}/UniProt:{uniprot_acc}" + "to the local CAZyme database." + ) + continue # set of pdb_accessions retrieved from UniProt retrieved_pdbs = uniprot_dict[uniprot_acc]["pdb"] From 3a1e5b4dd21b2b43e9db0c6dafff0c140ed1f825 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 12:45:35 +0000 Subject: [PATCH 45/67] update variable name to print gbk not in lcoal db --- .../sql/sql_interface/add_data/add_uniprot_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py index 587a28cc..007a9322 100644 --- a/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py +++ b/cazy_webscraper/sql/sql_interface/add_data/add_uniprot_data.py @@ -238,9 +238,9 @@ def add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args): gbk_id = gbk_dict[genbank_acc] except KeyError: logger.error( - f"Mapped the GenBank accession '{retrieved_gbk_acc}' to the UniProt accession\n" + f"Mapped the GenBank accession '{genbank_acc}' to the UniProt accession\n" f"'{uniprot_acc}' but the GenBank accession is not in the local CAZyme database\n" - f"therefore, not adding protein data for GBK:{retrieved_gbk_acc}/UniProt:{uniprot_acc}" + f"therefore, not adding protein data for GBK:{genbank_acc}/UniProt:{uniprot_acc}" "to the local CAZyme database." ) continue @@ -329,9 +329,9 @@ def add_pdb_accessions(uniprot_dict, gbk_dict, connection, args): gbk_db_id = gbk_dict[genbank_acc] except KeyError: logger.error( - f"Mapped the GenBank accession '{retrieved_gbk_acc}' to the UniProt accession\n" + f"Mapped the GenBank accession '{genbank_acc}' to the UniProt accession\n" f"'{uniprot_acc}' but the GenBank accession is not in the local CAZyme database\n" - f"therefore, not adding protein data for GBK:{retrieved_gbk_acc}/UniProt:{uniprot_acc}" + f"therefore, not adding protein data for GBK:{genbank_acc}/UniProt:{uniprot_acc}" "to the local CAZyme database." 
) continue From 82e9baf6f3aaddc1b81667c3df42ce937b93e2d9 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 15:01:46 +0000 Subject: [PATCH 46/67] update unit tests --- .../expand/uniprot/get_uniprot_data.py | 5 ++++- tests/test_uniprot.py | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py index 48e823ba..2b0c020d 100644 --- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py +++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py @@ -159,6 +159,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No # map the uniprot acc to the gene name and then retrieve the genbank accession from ncbi uniprot_dict = get_mapped_genbank_accessions(uniprot_dict, args) + acc_to_remove = set() for uniprot_acc in uniprot_dict: try: uniprot_dict[uniprot_acc]['genbank_accession'] @@ -169,7 +170,9 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No f"Not adding protein data for the UniProt accession '{uniprot_acc}' to the\n" "local CAZyme database." ) - del uniprot_dict[uniprot_acc] + acc_to_remove.add(uniprot_acc) + for uniprot_acc in acc_to_remove: + del uniprot_dict[uniprot_acc] # add uniprot accessions (and sequences if seq retrieval is enabled) logger.warning("Adding data to the local CAZyme database") diff --git a/tests/test_uniprot.py b/tests/test_uniprot.py index 469c66e1..fa346764 100644 --- a/tests/test_uniprot.py +++ b/tests/test_uniprot.py @@ -139,8 +139,20 @@ def mock_get_genbank_accessions(*args, **kwards): return {1: 1, 2:2, 3:3} def mock_get_uniprot_data(*args, **kwards): - return {1: {'ec': {1,2,3}, 'pdb': {1,2,3}}, 2: {'ec': {1,2,3}, 'pdb': {1,2,3}}, 3: {'ec': {1,2,3}, 'pdb': {1,2,3}}}, {1, 2, 3} + return ( + { + 1: {'genbank_accession': 1, 'gene_name': 1, 'ec': {1,2,3}, 'pdb': {1,2,3}}, + 2: {'gene_name': 'nan', 'ec': {1,2,3}, 'pdb': {1,2,3}}, 3: {'ec': {1,2,3}, 'pdb': {1,2,3}} + }, + {1, 2, 3} + ) + def mock_uniprot_data(*args, **kwards): + return { + 1: {'genbank_accession': 1, 'gene_name': 1, 'ec': {1,2,3}, 'pdb': {1,2,3}}, + 2: {'gene_name': 'nan', 'ec': {1,2,3}, 'pdb': {1,2,3}}, 3: {'ec': {1,2,3}, 'pdb': {1,2,3}} + } + monkeypatch.setattr(uniprot_parser, "build_parser", mock_building_parser) monkeypatch.setattr(ArgumentParser, "parse_args", mock_parser) monkeypatch.setattr(saint_logger, "config_logger", mock_return_logger) @@ -152,7 +164,7 @@ def mock_get_uniprot_data(*args, **kwards): monkeypatch.setattr(get_uniprot_data, "get_uniprot_accessions", mock_get_genbank_accessions) monkeypatch.setattr(get_uniprot_data, "get_uniprot_data", mock_get_uniprot_data) monkeypatch.setattr(get_uniprot_data, "get_gene_names", mock_get_uniprot_data) - monkeypatch.setattr(get_uniprot_data, "get_mapped_genbank_accessions", mock_get_uniprot_data) + monkeypatch.setattr(get_uniprot_data, "get_mapped_genbank_accessions", mock_uniprot_data) monkeypatch.setattr(get_uniprot_data, "get_linked_ncbi_accessions", mock_get_uniprot_data) monkeypatch.setattr(get_uniprot_data, "add_uniprot_accessions", mock_return_none) monkeypatch.setattr(get_uniprot_data, "add_ec_numbers", mock_return_none) From d2ff45467aa95bac02e18ccba3d1227243e124c5 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 9 Dec 2022 15:08:43 +0000 Subject: [PATCH 47/67] cache uniprot accessions that could not be mapped to genbank --- .../expand/uniprot/get_uniprot_data.py | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 
deletions(-)

diff --git a/cazy_webscraper/expand/uniprot/get_uniprot_data.py b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
index 2b0c020d..a155828b 100644
--- a/cazy_webscraper/expand/uniprot/get_uniprot_data.py
+++ b/cazy_webscraper/expand/uniprot/get_uniprot_data.py
@@ -157,7 +157,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
     # get genbank accessions by mapping uniprot accession to ncbi
     # if can't get genbank accession and did not retrieve a gene name from uniprot
     # map the uniprot acc to the gene name and then retrieve the genbank accession from ncbi
-    uniprot_dict = get_mapped_genbank_accessions(uniprot_dict, args)
+    uniprot_dict = get_mapped_genbank_accessions(uniprot_dict, cache_dir, args)
 
     acc_to_remove = set()
@@ -506,15 +506,18 @@ def cache_uniprot_data(uniprot_dict, cache_dir, time_stamp):
         json.dump(uniprot_dict, fh)
 
 
-def get_mapped_genbank_accessions(uniprot_dict, args):
+def get_mapped_genbank_accessions(uniprot_dict, cache_dir, args):
     """Map UniProt accessions to GenBank protein version accessions.
 
     :param uniprot_dict: {uniprot_acc: {gene_name: str, protein_name: str, pdb: set, ec: set, sequence:str, seq_data:str}}
+    :param cache_dir: path to cache directory
+    :param args: CLI parser
 
     Return uniprot_dict with GenBank accessions
     """
     logger = logging.getLogger(__name__)
     mapping_batch_size = 25
+    cache_path = cache_dir / "failed_mapped_uniprot_ids"
 
     bioservices_queries = get_chunks_list(
         list(uniprot_dict.keys()),
@@ -607,17 +610,19 @@ def get_mapped_genbank_accessions(uniprot_dict, cache_dir, args):
             pass
 
     if len(failed_ids) != 0:
-        for acc in failed_ids:
-            if acc in list(uniprot_dict.keys()):
-                if uniprot_dict[acc]['gene_name'] == 'nan':
-                    logger.error(
-                        f"Could not map the UniProt accession '{acc}' to an NCBI GenBank protein\n"
-                        "accession or gene name, so can't map the UniProt accession to a GenBank\n"
-                        "accession in the local CAZyme database.\n"
-                        f"Not adding protein data for UniProt accession '{acc}' to the local\n"
-                        "CAZyme database"
-                    )
-                    del uniprot_dict[acc]
+        with open(cache_path, "w") as fh:
+            for acc in failed_ids:
+                if acc in list(uniprot_dict.keys()):
+                    if uniprot_dict[acc]['gene_name'] == 'nan':
+                        logger.error(
+                            f"Could not map the UniProt accession '{acc}' to an NCBI GenBank protein\n"
+                            "accession or gene name, so can't map the UniProt accession to a GenBank\n"
+                            "accession in the local CAZyme database.\n"
+                            f"Not adding protein data for UniProt accession '{acc}' to the local\n"
+                            "CAZyme database"
+                        )
+                        del uniprot_dict[acc]
+                        fh.write(f"Could not map UniProt accession '{acc}' to a GenBank accession or gene name\n")
 
     uniprot_dict = get_linked_ncbi_accessions(uniprot_dict, args)

From c8474e254b5c614ba22435af41983d36ab5954db Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Thu, 5 Jan 2023 12:44:46 +0000
Subject: [PATCH 48/67] expand error catching when querying ncbi

catch RuntimeError, IncompleteRead, NotXMLError, TypeError and
AttributeError. Improve detection of invalid IDs. Working on not
retrying the connection for invalid IDs.
Invalid IDs are kept separate from IDs whose querying of NCBI was
interrupted by a connection failure
---
 .../sequences/get_genbank_sequences.py | 552 +++++++++++++-----
 1 file changed, 403 insertions(+), 149 deletions(-)

diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
index f2b92435..aec0dce1 100644
--- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
+++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py
@@ -125,8 +125,22 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No
         args,
     )
 
-    seq_dict = get_cache_seqs(start_time, args)
-    gbk_dict = {}
+    seq_dict = get_cache_seqs(start_time, args)  # {genbank_acc: Bio.Seq}
+
+    # Get GenBank accessions from a file or records matching the provided criteria, and get list
+    # of genbank accessions for proteins for which seqs need to be downloaded from NCBI
+    # gbk_dict = {gbk_acc: db id}
+    # accs_seqs_to_retrieve = [list of gbk acc to query ncbi with]
+    gbk_dict, accs_seqs_to_retrieve = get_records_to_retrieve(
+        class_filters,
+        family_filters,
+        taxonomy_filter_dict,
+        kingdom_filters,
+        ec_filters,
+        seq_dict,
+        start_time,
+        connection,
+        args,
+    )
 
     if args.file_only:
         logger.warning(
             "Only adding Seqs in JSON and/or FASTA file.\n"
             "Not retrieving seqs from NCBI\n"
         )
 
     else:  # retrieve data from NCBI for seqs in the local db
-        seq_dict, gbk_dict = get_seqs_from_ncbi(
-            class_filters,
-            family_filters,
-            taxonomy_filter_dict,
-            kingdom_filters,
-            ec_filters,
+        seq_dict = get_seqs_from_ncbi(
+            accs_seqs_to_retrieve,
             seq_dict,
             start_time,
-            connection,
             cache_dir,
             args,
         )
@@ -248,17 +257,27 @@ def get_records_to_retrieve(
     connection,
     args,
 ):
-    """Get Genbank accessions to retrieved data from NCBI for
+    """Get GenBank accessions to retrieve data for from NCBI, plus the GenBank accessions and
+    local database IDs for those records. Remove seqs that are not in the local db.
 
     :param seq_dict: dict {id: seq} of seqs retrieved from cache json/fasta files
     :param start_time: str: time program was called
     :param connection: open connection to a SQLite db engine
     :param args: CLI args parser
 
-    Return a dict {gbk_acc: gbk db id
+    Return a dict {gbk_acc: gbk db id} and list of genbank accs to retrieve seqs for.
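+
+    An illustrative return value (hypothetical accession and database id):
+        ({'ABC12345.1': 1}, ['ABC12345.1'])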
""" logger = logging.getLogger(__name__) + all_accessions = list(seq_dict.keys()) + + if args.file_only: + gbk_dict = get_selected_gbks.get_ids(all_accessions, connection) + accs_to_retrieve = [] + return gbk_dict, accs_seqs_to_retrieve + + accessions = [] + # retrieve dict of genbank accession and genbank db ids from the local CAZyme db if args.genbank_accessions is not None: logger.warning(f"Getting GenBank accessions from file: {args.genbank_accessions}") @@ -276,12 +295,11 @@ def get_records_to_retrieve( closing_message("Get GenBank seqs", start_time, args, early_term=True) accessions = [line.strip() for line in lines if line not in list(seq_dict.keys())] - accessions = set(accessions) - - gbk_dict = get_selected_gbks.get_ids(accessions, connection) + all_accessions += list(set(accessions)) else: - gbk_dict = get_selected_gbks.get_genbank_accessions( + logger.warning("Getting GenBank accessions of proteins matching the provided criteria from the local CAZyme db") + selected_gbk_dict = get_selected_gbks.get_genbank_accessions( class_filters, family_filters, taxonomy_filter_dict, @@ -290,61 +308,117 @@ def get_records_to_retrieve( connection, ) - gbk_accs = list(gbk_dict.keys()) + accessions = list(gbk_dict.keys()) + all_accessions += accessions # don't retrieve data for cached seqs for gbk_acc in gbk_accs: if gbk_acc in list(seq_dict.keys()): del gbk_dict[gbk_acc] - return gbk_dict + gbk_dict = get_selected_gbks.get_ids(all_accessions, connection) + accs_to_retrieve = [for acc in all_accessions if acc not in list(seq_dict.keys())] + + return gbk_dict, accs_to_retrieve def get_seqs_from_ncbi( - class_filters, - family_filters, - taxonomy_filter_dict, - kingdom_filters, - ec_filters, + accs_seqs_to_retrieve seq_dict, start_time, - connection, cache_dir, args, ): """Coordinate retrieving sequence data from NCBI for proteins not retrieved from cache files + :param accs_seqs_to_retrieve: list of gbk accs of protein seqs to retrieve from ncbi :param seq_dict: dict {id: seq} of seqs retrieved from cache json/fasta files :param start_time: str: time program was called - :param connection: open connection to a SQLite db engine + :param cache_dir: path to cache directory :param args: CLI args parser - Return dicts of {acc: Bio.Seq} and {gbk acc: db id} + Return updated seqs_dict, {acc: Bio.Seq}, containing seqs retrieved from NCBI """ logger = logging.getLogger(__name__) - gbk_dict = get_records_to_retrieve( - class_filters, - family_filters, - taxonomy_filter_dict, - kingdom_filters, - ec_filters, - seq_dict, - start_time, - connection, - args, - ) + logger.warning(f"Retrieving GenBank sequences from NCBI for {len(accs_seqs_to_retrieve)}") - genbank_accessions = list(gbk_dict.keys()) - logger.warning(f"Retrieving GenBank sequences from NCBI for {len(gbk_dict.keys())}") - - if len(genbank_accessions) == 0: + if len(accs_seqs_to_retrieve) == 0: return seq_dict - cache_path = cache_dir / f"genbank_seqs_{start_time}.fasta" + cache_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + cache_path = cache_dir / f"cw_genbank_seqs_{cache_time}.fasta" + invalid_ids_cache = cache_dir / "invalid_ids" # break up long list into managable chunks - all_queries = get_chunks_list(genbank_accessions, args.batch_size) + batches = get_chunks_list(accs_seqs_to_retrieve, args.batch_size) + + ( + seq_records, + batches_with_invalid_ids, + failed_connections_batches, + accs_still_to_fetch, + ) = get_seqs(batches, cache_path, invalid_ids_cache, args) + + if len(failed_connections_batches) > 0: + # Retry failed batches 
before parsing invalid IDs as the failed connections
+        # may contain invalid IDs
+        (
+            seq_records,
+            batches_with_invalid_ids,
+            accs_still_to_fetch,
+        ) = parse_failed_connections(
+            failed_connections_batches,
+            seq_records,
+            accs_still_to_fetch,
+            cache_path,
+            invalid_ids_cache,
+            args,
+        )
+
+    if len(batches_with_invalid_ids) > 0:
+        ## retry IDs individually and identify invalid IDs
+        # break up batches into batches of len 1 (1 acc per list/batch)
+        individual_batches = []
+        for batch in batches_with_invalid_ids:
+            for acc in batch:
+                individual_batches.append([acc])
+
+        seq_records, accs_still_to_fetch = parse_invalid_id_batches(
+            individual_batches,
+            cache_path,
+            invalid_ids_cache,
+            args,
+        )
+
+    if len(accs_still_to_fetch) > 0:
+        logger.error(
+            f"Could not fetch seqs for {len(accs_still_to_fetch)} NCBI protein accessions.\n"
+            "Caching these protein accessions"
+        )
+        failed_ids_cache = cache_dir / "failed_retrieve_ids"
+        with open(failed_ids_cache, "a") as fh:
+            for acc in accs_still_to_fetch:
+                fh.write(f"{acc}\n")
+
+    # add seq records to seq dict
+    for record in seq_records:
+        try:
+            seq = seq_dict[record.id]
+            if seq != record.seq:
+                logger.warning(
+                    f"Seq retrieved from NCBI for {record.id} does not match the cached seq.\n"
+                    "Using the seq retrieved from NCBI"
+                )
+                seq_dict[record.id] = record.seq
+        except KeyError:
+            seq_dict[record.id] = record.seq
+
+    logger.warning(f"Retrieved {len(seq_records)} from NCBI")
+
+    return seq_dict
 
     # list of downloaded SeqRecords and list of gbk acc for which no seq was retrieved from NCBI
     downloaded_seqs, failed_queries = get_sequences(all_queries, cache_dir, args)
@@ -433,163 +507,343 @@ def get_seqs_from_ncbi(
     return seq_dict, gbk_dict
 
 
-def get_sequences(batches, cache_dir, args, disable_prg_bar=False):
+def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False):
     """Retrieve protein sequences from Entrez.
 
     :param batches: list of lists, one list per batch of gbk acc to query against NCBI
-    :param cache_dir: Path, to cache directory
+    :param cache_path: Path, to cache fasta file of protein seqs
+    :param invalid_ids_cache: Path to file to cache invalid IDs
     :param args: cmd-line args parser
 
     Return
     * list of SeqRecords
-    * list of failed batches
+    * list of nested lists of batches containing one or more invalid IDs (i.e.
no longer listed in NCBI)
+    * dict of batches which suffered failed connections when querying NCBI
+    * list of accessions for which a seq has not been retrieved yet
     """
     logger = logging.getLogger(__name__)
 
-    cache_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    cache_path = cache_dir / f"genbank_seqs_{cache_time}.fasta"
-
     gbk_acc_to_retrieve = []
     for batch in batches:
         gbk_acc_to_retrieve += [acc for acc in batch]
 
-    seq_records = []  # SeqRecords, can't be set, SeqRecords are unhasable
+    invalid_ids = set()
+    batches_with_invalid_ids = []  # nested lists of batches with invalid IDs
+    failed_connections_batches = []  # nested list of batches during the querying of which the connection failed
 
-    failed_queries = []  # lists which raised an error,
-    # likely because contain an accession not in NCBI
+    seq_records = []
 
-    irregular_accessions = []
-
-    success_accessions = set()  # accessions for which seqs were retrieved
+    successful_accessions = []  # accessions for which seqs were retrieved
 
     for batch in tqdm(batches, desc="Batch querying NCBI.Entrez", disable=disable_prg_bar):
+        epost_webenv, epost_query_key, success = post_accessions_to_entrez(batch, args)
 
-        # POST IDS
-        try:
-            epost_webenv, epost_query_key = bulk_query_ncbi(batch, args)
-        except RuntimeError:
-            logger.warning(
-                "Runtime error raised when batch quering\n"
-                "Possible result of a accessions not being in NCBI\n"
-                "Attempt identification of the causal accession later\n"
-            )
-
-            failed_queries.append(batch)
+        if success == "Invalid ID":
+            # invalid ID, cache, and do not query again
+            invalid_ids.add(batch[0])
+            with open(invalid_ids_cache, "a") as fh:
+                fh.write(f"{batch[0]}\n")
             continue
 
-        if epost_webenv is None:
-            failed_queries.append(batch)
+        elif success == "Contains invalid ID":
+            # batch contains one or more invalid IDs.
+            # Don't know which are invalid so will try later
+            batches_with_invalid_ids.append(batch)
             continue
 
+        elif success == "Failed connection":
+            # Connection failed so retry later
+            failed_connections_batches.append(batch)
+            continue
 
+        # else was a success, so proceed to retrieving protein seqs
+        seq_records, success, temp_successful_accessions = fetch_ncbi_seqs(
+            seq_records, batch, epost_webenv, epost_query_key, gbk_acc_to_retrieve, args
+        )
 
+        if success == "Invalid ID":
+            invalid_ids.add(batch[0])
+            with open(invalid_ids_cache, "a") as fh:
+                fh.write(f"{batch[0]}\n")
             continue
 
-        # Retrieve seqs
-        try:
-            with entrez_retry(
-                args.retries,
-                Entrez.efetch,
-                db="Protein",
-                query_key=epost_query_key,
-                WebEnv=epost_webenv,
-                rettype="fasta",
-                retmode="text",
-            ) as seq_handle:
-                for record in SeqIO.parse(seq_handle, "fasta"):
-                    temp_accession = record.id
-
-                    # check if multiple items returned in ID
-                    try:
-                        retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in gbk_acc_to_retrieve][0]
-                    except IndexError:
-                        # try re search for accession in string
-                        try:
-                            retrieved_accession = re.match(r"\D{3}\d+\.\d+", record.id).group()
-                        except AttributeError:
-                            try:
-                                retrieved_accession = re.match(r"\D{2}_\d+\.\d+", record.id).group()
-                            except AttributeError:
-                                logger.error(
-                                    "Could not retrieve a GenBank protein accession matching "
-                                    "an accession from the local database from:\n"
-                                    f"{record.id}\n"
-                                    "The sequence from this record will not be added to the db"
-                                )
-                                irregular_accessions.append(temp_accession)
-                                continue
-
-                    if retrieved_accession not in success_accessions:
-                        seq_records.append(record)
-                        success_accessions.add(retrieved_accession)
-
-        except IncompleteRead as err:
-            logger.warning(
-                "IncompleteRead error raised:\n"
-                f"{err}\n"
-                "Will reattempt NCBI query later"
-            )
-
-            failed_queries.append(batch)
+        elif success == "Contains invalid ID":
+            batches_with_invalid_ids.append(batch)
             continue
 
-        except Exception as err:
-            logger.warning(
-                "Error raised:\n"
-                f"{err}\n"
-                "Will reattempt NCBI query later"
-            )
-
-            failed_queries.append(batch)
+        elif success == "Failed connection":
+            failed_connections_batches.append(batch)
             continue
 
-        # cache the currently retrieved seqs
         SeqIO.write(seq_records, cache_path, "fasta")
+        successful_accessions += temp_successful_accessions
 
-    # list of GenBank accessions for which no protein sequence was retrieved
+    accs_still_to_fetch = [
+        acc for acc in gbk_acc_to_retrieve
+        if (acc not in invalid_ids) and (acc not in successful_accessions)
+    ]
 
-    return seq_records, failed_queries
+    return seq_records, batches_with_invalid_ids, failed_connections_batches, accs_still_to_fetch
 
 
-def bulk_query_ncbi(accessions, args):
-    """Bulk query NCBI and retrieve webenvironment and history tags
+def post_accessions_to_entrez(batch, args):
+    """Post NCBI protein accessions to NCBI via Entrez, and capture the error message if one is returned.
 
-    :param accessions: list of GenBank protein accessions
-    :param args: cmd-line args parser
+    :param batch: list of genbank accessions
+    :param args: cmd-line args parser
 
-    Return webenv and query key
+    Return Entrez ePost web environment and query key.
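+
+    A minimal sketch of the Entrez ePost result the caller unpacks (values
+    illustrative):
+        {'WebEnv': 'MCID_0123...', 'QueryKey': '1'}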
""" logger = logging.getLogger(__name__) - - # perform batch query of Entrez - try: - accessions_string = ",".join(accessions) - except TypeError: - accessions_string = accessions - - # Runtime error captured by try/except function call + success, epost_result = None, None + try: epost_result = Entrez.read( entrez_retry( args.retries, Entrez.epost, db="Protein", - id=accessions_string, + id=",".join(batch), ), validate=False, ) - except NotXMLError: - logger.error("Could not parse Entrez output") - return None, None - except IncompleteRead: - logger.error( - "IncompleteRead error raised when posting protein record(s)\n" - "Will try again later" + + except RuntimeError as err: + if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted.") or repr(err).startswith("RuntimeError('Empty ID list; Nothing to store')"): + + if len(batch) == 1: + logger.warning( + f"Accession '{batch[0]}' not listed in NCBI. Querying NCBI returns the error:\n" + f"{repr(err)}\n" + f"Not retrieving seq for '{batch[0]}'" + ) + success = "Invalid ID" + + else: + logger.warning( + "Batch contains an accession no longer listed in NCBI\n" + f"Querying NCBI returns the error:\n{err}\n" + "Will identify invalid ID(s) later" + ) + success = "Contains invalid ID" + + else: # unrecognised Runtime error + if len(batch) == 1: + logger.warning( + f"Runtime error occured when querying NCBI by accession '{batch[0]}'\n" + f"Error returned:\n{err}\n" + "Interrupting error as recognition of an invalid ID. Therefore,\n" + f"not adding seq for '{batch[0]} to the local CAZyme db'" + ) + success = "Invalid ID" + + else: + logger.warning( + f"Runtime error occured when querying NCBI with batch of gbk accessions\n" + f"Error returned:\n{err}\n" + "Interrupting error as batch containing invalid ID.\n" + "Will identify invalid ID(s) later" + ) + success = "Contains invalid ID" + + except IncompleteRead as err: + logger.warning( + "IncompleteRead error raised when querying NCBI:\n" + f"{err}\n" + "Will reattempt NCBI query later" + ) + success = "Failed connection" + + except NotXMLError as err: + logger.warning( + "NotXMLError raised when querying NCBI:\n" + f"{err}\n" + "Will reattempt NCBI query later" ) - return None, None + success = "Failed connection" + + except (TypeError, AttributeError) as err: # if no record is returned from call to Entrez + logger.warning( + f"Error occurenced when batch quering NCBI\n" + "Error retrieved:\n" + f"{repr(err)}\n" + "Will retry batch later" + ) + success = "Failed connection" + + except Exception as err: # if no record is returned from call to Entrez + logger.warning( + f"Error occurenced when batch quering NCBI\n" + "Error retrieved:\n" + f"{repr(err)}\n" + "Will retry batch later" + ) + success = "Failed connection" # retrieve the web environment and query key from the Entrez post - epost_webenv = epost_result["WebEnv"] - epost_query_key = epost_result["QueryKey"] + try: + epost_webenv = epost_result["WebEnv"] + epost_query_key = epost_result["QueryKey"] + success = "Complete" + except (TypeError, AttributeError): + epost_webenv, epost_query_key = None, None + pass # raised when could not epost failed + + return epost_webenv, epost_query_key, success - return epost_webenv, epost_query_key + +def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve): + """Retrieve protein sequences from NCBI from ePost of protein v.accs. 
+
+    :param seq_records: list of Bio.SeqRecords
+    :param batch: list of the NCBI protein version accessions posted in this batch
+    :param epost_webenv: Entrez ePost web environment
+    :param epost_query_key: Entrez ePost query key
+    :param acc_to_retrieve: set of NCBI protein version accessions to retrieve seqs for
+    :param args: cmd-line args parser
+
+    Return updated list of SeqRecords and a string marking the success/failure of seq retrieval.
+    """
+    logger = logging.getLogger(__name__)
+    success, successful_accessions = None, []
+
+    try:
+        with entrez_retry(
+            args.retries,
+            Entrez.efetch,
+            db="Protein",
+            query_key=epost_query_key,
+            WebEnv=epost_webenv,
+            rettype="fasta",
+            retmode="text",
+        ) as seq_handle:
+            for record in SeqIO.parse(seq_handle, "fasta"):
+                retrieved_accession = None
+
+                # check if multiple items returned in ID
+                try:
+                    retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in acc_to_retrieve][0]
+                except IndexError:
+                    # try re search for accession in string
+                    try:
+                        retrieved_accession = re.match(r"\D{3}\d+\.\d+", record.id).group()
+                    except AttributeError:
+                        try:
+                            retrieved_accession = re.match(r"\D{2}_\d+\.\d+", record.id).group()
+                        except AttributeError:
+                            logger.warning(
+                                f"Could not fetch protein acc from record id '{record.id}'.\n"
+                                "Will search all target accessions against record id"
+                            )
+                            for acc in acc_to_retrieve:
+                                if record.id.find(acc) != -1:
+                                    retrieved_accession = acc
+
+                if retrieved_accession is None:
+                    logger.error(
+                        "Could not retrieve an NCBI protein version accession matching\n"
+                        f"an accession from the local database from the record id '{record.id}'\n"
+                        "The sequence from this record will not be added to the db"
+                    )
+                    continue
+
+                seq_records.append(record)
+                successful_accessions.append(retrieved_accession)
+
+    except RuntimeError as err:
+        if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted.") or repr(err).startswith("RuntimeError('Empty ID list; Nothing to store')"):
+
+            if len(batch) == 1:
+                logger.warning(
+                    f"Accession '{batch[0]}' not listed in NCBI. Querying NCBI returns the error:\n"
+                    f"{repr(err)}\n"
+                    f"Not retrieving seq for '{batch[0]}'"
+                )
+                success = "Invalid ID"
+
+            else:
+                logger.warning(
+                    "Batch contains an accession no longer listed in NCBI\n"
+                    f"Querying NCBI returns the error:\n{err}\n"
+                    "Will identify invalid ID(s) later"
+                )
+                success = "Contains invalid ID"
+
+        else:  # unrecognised Runtime error
+            if len(batch) == 1:
+                logger.warning(
+                    f"Runtime error occurred when fetching record from NCBI for accession '{batch[0]}'\n"
+                    f"Error returned:\n{err}\n"
+                    "Interpreting error as recognition of an invalid ID.
Therefore,\n" + f"not adding seq for '{batch[0]} to the local CAZyme db'" + ) + success = "Invalid ID" + + else: + logger.warning( + f"Runtime error occured when fetching records from NCBI\n" + f"Error returned:\n{err}\n" + "Interrupting error as batch containing invalid ID.\n" + "Will identify invalid ID(s) later" + ) + success = "Contains invalid ID" + + except IncompleteRead as err: + logger.warning( + "IncompleteRead error raised when fetching record from NCBI:\n" + f"{err}\n" + "Will reattempt NCBI query later" + ) + success = "Failed connection" + + except NotXMLError as err: + logger.warning( + "NotXMLError raised when fetching record(s) from NCBI:\n" + f"{err}\n" + "Will reattempt NCBI query later" + ) + success = "Failed connection" + + except (TypeError, AttributeError) as err: # if no record is returned from call to Entrez + logger.warning( + f"Error occurenced when fetching records from NCBI\n" + "Error retrieved:\n" + f"{repr(err)}\n" + "Will retry batch later" + ) + success = "Failed connection" + + except Exception as err: # if no record is returned from call to Entrez + logger.warning( + f"Error occurenced when batch quering NCBI\n" + "Error retrieved:\n" + f"{repr(err)}\n" + "Will retry batch later" + ) + success = "Failed connection" + + return seq_records, success, success_accessions + + +def parse_failed_connections( + failed_connections_batches, + seq_records, + accs_to_fetch, + cache_path, + invalid_ids_cache, + args, +): + """Parse batches that suffered failed connections on the first attempt. + + :param failed_connections_batches: list of batches + :param seq_records: list of Bio.SeqRecords + :param accs_to_fetch: list of NCBI protein accs to retrieve seqs for + :param cache_path: path to cache downloaded seqs + :param invalid_ids_cache: path to cache invalid IDs + :param args: CLI args parser + + Return + * updated list of seq_records + * batches with invalid IDs + * accs still to fetch seqs for + """ + if __name__ == "__main__": From ec3d6d44dfce8fa9880ba9d7085098d5326e8794 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 6 Jan 2023 15:01:06 +0000 Subject: [PATCH 49/67] move ncbi-calling funcs to ncbi mod move the functions that perform the called to NCBI.Entrez to the NCBI module --- .../sequences/get_genbank_sequences.py | 325 +++++------------- cazy_webscraper/ncbi/sequences/__init__.py | 284 +++++++++++++++ 2 files changed, 365 insertions(+), 244 deletions(-) create mode 100644 cazy_webscraper/ncbi/sequences/__init__.py diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index aec0dce1..b2503e0f 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -360,7 +360,7 @@ def get_seqs_from_ncbi( accs_still_to_fetch, ) = get_seqs(batches, cache_path, invalid_ids_cache, args) - if len(failed_connections_batches) > 0: + if len(list(failed_connections_batches.keys())) > 0: # Retry failed batches before parsing invalid IDs as the failed connections # may contain invalid IDs ( @@ -371,7 +371,9 @@ def get_seqs_from_ncbi( failed_connections_batches, seq_records, accs_still_to_fetch, + cache_dir, cache_path, + batches_with_invalid_ids. 
invalid_ids_cache, args, ) @@ -530,7 +532,7 @@ def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False invalid_ids = set() batches_with_invalid_ids = [] # nested lists of batches with invalid IDs - failed_connections_batches = [] # nested list of batches during the querying of which the connection failed + failed_connections_batches = {} # dict of batches during the querying of which the connection failed seq_records = [] @@ -555,7 +557,10 @@ def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False elif success == "Failed connection": # Connection failed so retry later - failed_connections_batches.append(batch) + failed_connections_batches[str(batch)] = { + 'batch': batch, + '#_of_attempts': 1 + } continue # else was a success, so proceed to retrieving protein seqs @@ -573,7 +578,10 @@ def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False continue elif success == "Failed connection": - failed_connections_batches.append(batch) + failed_connections_batches[str(batch)] = { + 'batch': batch, + '#_of_attempts': 1 + } continue SeqIO.write(seq_records, cache_path, "fasta") @@ -582,268 +590,97 @@ def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False accs_still_to_fetch = [acc for acc in gbk_acc_to_retrieve if ((acc not in invalid_ids) or (acc not in successful_accessions))] return seq_records, batches_with_invalid_ids, failed_connections_batches, accs_still_to_fetch - - -def post_accessions_to_entrez(batch): - """Post NCBI protein accessions to NCBI via Entrez, and capture error message if one returned. - - :param batch: list of genbank accessions - :param entrez_function: Entrez function (epost or efetch) - - Return Entrez ePost web environment and query key. - """ - logger = logging.getLogger(__name__) - success, epost_result = None, None - - try: - epost_result = Entrez.read( - entrez_retry( - args.retries, - Entrez.epost, - db="Protein", - id=",".join(batch), - ), - validate=False, - ) - - except RuntimeError as err: - if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted.") or repr(err).startswith("RuntimeError('Empty ID list; Nothing to store')"): - - if len(batch) == 1: - logger.warning( - f"Accession '{batch[0]}' not listed in NCBI. Querying NCBI returns the error:\n" - f"{repr(err)}\n" - f"Not retrieving seq for '{batch[0]}'" - ) - success = "Invalid ID" - - else: - logger.warning( - "Batch contains an accession no longer listed in NCBI\n" - f"Querying NCBI returns the error:\n{err}\n" - "Will identify invalid ID(s) later" - ) - success = "Contains invalid ID" - - else: # unrecognised Runtime error - if len(batch) == 1: - logger.warning( - f"Runtime error occured when querying NCBI by accession '{batch[0]}'\n" - f"Error returned:\n{err}\n" - "Interrupting error as recognition of an invalid ID. 
Therefore,\n" - f"not adding seq for '{batch[0]} to the local CAZyme db'" - ) - success = "Invalid ID" - - else: - logger.warning( - f"Runtime error occured when querying NCBI with batch of gbk accessions\n" - f"Error returned:\n{err}\n" - "Interrupting error as batch containing invalid ID.\n" - "Will identify invalid ID(s) later" - ) - success = "Contains invalid ID" - - except IncompleteRead as err: - logger.warning( - "IncompleteRead error raised when querying NCBI:\n" - f"{err}\n" - "Will reattempt NCBI query later" - ) - success = "Failed connection" - - except NotXMLError as err: - logger.warning( - "NotXMLError raised when querying NCBI:\n" - f"{err}\n" - "Will reattempt NCBI query later" - ) - success = "Failed connection" - - except (TypeError, AttributeError) as err: # if no record is returned from call to Entrez - logger.warning( - f"Error occurenced when batch quering NCBI\n" - "Error retrieved:\n" - f"{repr(err)}\n" - "Will retry batch later" - ) - success = "Failed connection" - - except Exception as err: # if no record is returned from call to Entrez - logger.warning( - f"Error occurenced when batch quering NCBI\n" - "Error retrieved:\n" - f"{repr(err)}\n" - "Will retry batch later" - ) - success = "Failed connection" - - # retrieve the web environment and query key from the Entrez post - try: - epost_webenv = epost_result["WebEnv"] - epost_query_key = epost_result["QueryKey"] - success = "Complete" - except (TypeError, AttributeError): - epost_webenv, epost_query_key = None, None - pass # raised when could not epost failed - - return epost_webenv, epost_query_key, success - - -def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve): - """Retrieve protein sequences from NCBI from ePost of protein v.accs. - - :param seq_records: list of Bio.SeqRecords - :param epost_websenv: Entrez ePost webenvironment - :param epost_query_key: Entrez ePost query key - :param acc_to_retrieve: set of NCBI protein version accessions to retrieve seqs for - - Return updated list of SeqRecords and string marking success/failure or seq retrieval. 
- """ - logger = logging.getLogger(__name__) - success, successful_accessions = None, [] - - try: - with entrez_retry( - args.retries, - Entrez.efetch, - db="Protein", - query_key=epost_query_key, - WebEnv=epost_webenv, - rettype="fasta", - retmode="text", - ) as seq_handle: - for record in SeqIO.parse(seq_handle, "fasta"): - retrieved_accession = None - - # check if multiple items returned in ID - try: - retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in gbk_acc_to_retrieve][0] - except IndexError: - # try re search for accession in string - try: - retrieved_accession = re.match(r"\D{3}\d+\.\d+", record.id).group() - except AttributeError: - try: - retrieved_accession = re.match(r"\D{2}_\d+\.\d+", record.id).group() - except AttributeError: - logger.warning( - f"Could not fetch protein acc from record id '{record.id}'.\n" - "Will search all target accessions against record id" - ) - for acc in acc_to_retrieve: - if record.id.find(acc) != -1: - retrieved_accession = acc - - if retrieved_accession is None: - logger.error( - "Could not retrieve a NCBI protein version accession matching\n" - f"an accession from the local database from the record id '{record.id}'\n" - "The sequence from this record will not be added to the db" - ) - continue - - seq_records.append(record) - successful_accessions.add(retrieved_accession) - - except RuntimeError as err: - if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted.") or repr(err).startswith("RuntimeError('Empty ID list; Nothing to store')"): - - if len(batch) == 1: - logger.warning( - f"Accession '{batch[0]}' not listed in NCBI. Querying NCBI returns the error:\n" - f"{repr(err)}\n" - f"Not retrieving seq for '{batch[0]}'" - ) - success = "Invalid ID" - - else: - logger.warning( - "Batch contains an accession no longer listed in NCBI\n" - f"Querying NCBI returns the error:\n{err}\n" - "Will identify invalid ID(s) later" - ) - success = "Contains invalid ID" - - else: # unrecognised Runtime error - if len(batch) == 1: - logger.warning( - f"Runtime error occured when fetching record from NCBI for accession '{batch[0]}'\n" - f"Error returned:\n{err}\n" - "Interrupting error as recognition of an invalid ID. 
Therefore,\n" - f"not adding seq for '{batch[0]} to the local CAZyme db'" - ) - success = "Invalid ID" - - else: - logger.warning( - f"Runtime error occured when fetching records from NCBI\n" - f"Error returned:\n{err}\n" - "Interrupting error as batch containing invalid ID.\n" - "Will identify invalid ID(s) later" - ) - success = "Contains invalid ID" - - except IncompleteRead as err: - logger.warning( - "IncompleteRead error raised when fetching record from NCBI:\n" - f"{err}\n" - "Will reattempt NCBI query later" - ) - success = "Failed connection" - - except NotXMLError as err: - logger.warning( - "NotXMLError raised when fetching record(s) from NCBI:\n" - f"{err}\n" - "Will reattempt NCBI query later" - ) - success = "Failed connection" - - except (TypeError, AttributeError) as err: # if no record is returned from call to Entrez - logger.warning( - f"Error occurenced when fetching records from NCBI\n" - "Error retrieved:\n" - f"{repr(err)}\n" - "Will retry batch later" - ) - success = "Failed connection" - - except Exception as err: # if no record is returned from call to Entrez - logger.warning( - f"Error occurenced when batch quering NCBI\n" - "Error retrieved:\n" - f"{repr(err)}\n" - "Will retry batch later" - ) - success = "Failed connection" - - return seq_records, success, success_accessions def parse_failed_connections( failed_connections_batches, seq_records, accs_to_fetch, + cache_dir, cache_path, + batches_with_invalid_ids, invalid_ids_cache, args, ): """Parse batches that suffered failed connections on the first attempt. - :param failed_connections_batches: list of batches + :param failed_connections_batches: dict of batches, key str(batch): {'batch': [], '#_of_attempts': int} :param seq_records: list of Bio.SeqRecords :param accs_to_fetch: list of NCBI protein accs to retrieve seqs for :param cache_path: path to cache downloaded seqs + :param batches_with_invalid_ids: list of batches containing invalid IDs :param invalid_ids_cache: path to cache invalid IDs :param args: CLI args parser Return * updated list of seq_records * batches with invalid IDs - * accs still to fetch seqs for """ - + logger = logging.getLogger(__name__) + failed_connection_cache = cache_dir / "failed_entrez_connection_accessions" + failed_batches = [failed_connections_batches[batch_name]['batch'] for batch_name in failed_connections_batches] + + while (len(list(failed_connections_batches.keys())) != 0: + # remove processed batches from failed_batches + for batch in failed_batches: + try: + failed_connections_batches[str(batch)] + except KeyError: + # batch has been processed and no longer in failed_connections + # delete batch from list + failed_batches.remove(batch) + + if len(failed_batches) > 0: + logger.warning( + f"Retrying connection for {len(failed_batches)} remaining batches suffering failed connections" + ) + + # batch query failed batches + ( + new_seq_records, + new_batches_with_invalid_ids, + new_failed_connections_batches, + new_accs_still_to_fetch, + ) = get_seqs( + failed_batches, + cache_path, + invalid_ids_cache, + args, + ) + + seq_records += new_seq_records + # remove successfully processed batches + for batch in failed_batches: + if (batch not in new_batches_with_invalid_ids) and (batch not in list(new_failed_connections_batches.keys())): + # not in invalid IDs or new failed batches, presume batch was processed + failed_batches.remove(batch) + del failed_connections_batches[str(batch)] + + batches_with_invalid_ids += new_batches_with_invalid_ids + # remove batches with invalid IDs, 
so don't retry connection + for batch in batches_with_invalid_ids: + failed_connections_batches[str(batch)] + failed_batches.remove(batch) + + # remove batches that were processed successfully and + # increate attempt count for batches whose connections failed + for batch in new_failed_connections_batches: + failed_connections_batches[str(batch)] + + if failed_connections_batches[str(failed_connections_batches)]['#_of_attempts'] >= args.retries: + logger.error( + "Ran out of connection attempts for the following batch:\n" + f"{batch}\n" + "Will not add seqs for these proteins to the local CAZyme database." + ) + with open(failed_connection_cache, "a") as fh: + for protein_acc in batch: + fh.write(f"{protein_acc}\n") + del failed_connections_batches[str(batch)] + + return seq_records, batches_with_invalid_ids + + if __name__ == "__main__": diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py new file mode 100644 index 00000000..1f56f575 --- /dev/null +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# (c) University of St Andrews 2022 +# (c) University of Strathclyde 2022 +# Author: +# Emma E. M. Hobbs + +# Contact +# eemh1@st-andrews.ac.uk + +# Emma E. M. Hobbs, +# Biomolecular Sciences Building, +# University of St Andrews, +# North Haugh Campus, +# St Andrews, +# KY16 9ST +# Scotland, +# UK + +# The MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +"""Get the protein records for a provided list of gene names""" + + +import logging + +from Bio import Entrez +from saintBioutils.genbank import entrez_retry +from saintBioutils.misc import get_chunks_list +from tqdm import tqdm + + +def post_accessions_to_entrez(batch): + """Post NCBI protein accessions to NCBI via Entrez, and capture error message if one returned. + + :param batch: list of genbank accessions + :param entrez_function: Entrez function (epost or efetch) + + Return Entrez ePost web environment and query key. 
+ """ + logger = logging.getLogger(__name__) + success, epost_result = None, None + + try: + epost_result = Entrez.read( + entrez_retry( + args.retries, + Entrez.epost, + db="Protein", + id=",".join(batch), + ), + validate=False, + ) + + except RuntimeError as err: + if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted.") or repr(err).startswith("RuntimeError('Empty ID list; Nothing to store')"): + + if len(batch) == 1: + logger.warning( + f"Accession '{batch[0]}' not listed in NCBI. Querying NCBI returns the error:\n" + f"{repr(err)}\n" + f"Not retrieving seq for '{batch[0]}'" + ) + success = "Invalid ID" + + else: + logger.warning( + "Batch contains an accession no longer listed in NCBI\n" + f"Querying NCBI returns the error:\n{err}\n" + "Will identify invalid ID(s) later" + ) + success = "Contains invalid ID" + + else: # unrecognised Runtime error + if len(batch) == 1: + logger.warning( + f"Runtime error occured when querying NCBI by accession '{batch[0]}'\n" + f"Error returned:\n{err}\n" + "Interrupting error as recognition of an invalid ID. Therefore,\n" + f"not adding seq for '{batch[0]} to the local CAZyme db'" + ) + success = "Invalid ID" + + else: + logger.warning( + f"Runtime error occured when querying NCBI with batch of gbk accessions\n" + f"Error returned:\n{err}\n" + "Interrupting error as batch containing invalid ID.\n" + "Will identify invalid ID(s) later" + ) + success = "Contains invalid ID" + + except IncompleteRead as err: + logger.warning( + "IncompleteRead error raised when querying NCBI:\n" + f"{err}\n" + "Will reattempt NCBI query later" + ) + success = "Failed connection" + + except NotXMLError as err: + logger.warning( + "NotXMLError raised when querying NCBI:\n" + f"{err}\n" + "Will reattempt NCBI query later" + ) + success = "Failed connection" + + except (TypeError, AttributeError) as err: # if no record is returned from call to Entrez + logger.warning( + f"Error occurenced when batch quering NCBI\n" + "Error retrieved:\n" + f"{repr(err)}\n" + "Will retry batch later" + ) + success = "Failed connection" + + except Exception as err: # if no record is returned from call to Entrez + logger.warning( + f"Error occurenced when batch quering NCBI\n" + "Error retrieved:\n" + f"{repr(err)}\n" + "Will retry batch later" + ) + success = "Failed connection" + + # retrieve the web environment and query key from the Entrez post + try: + epost_webenv = epost_result["WebEnv"] + epost_query_key = epost_result["QueryKey"] + success = "Complete" + except (TypeError, AttributeError): + epost_webenv, epost_query_key = None, None + pass # raised when could not epost failed + + return epost_webenv, epost_query_key, success + + +def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve): + """Retrieve protein sequences from NCBI from ePost of protein v.accs. + + :param seq_records: list of Bio.SeqRecords + :param epost_websenv: Entrez ePost webenvironment + :param epost_query_key: Entrez ePost query key + :param acc_to_retrieve: set of NCBI protein version accessions to retrieve seqs for + + Return updated list of SeqRecords and string marking success/failure or seq retrieval. 
+ """ + logger = logging.getLogger(__name__) + success, successful_accessions = None, [] + + try: + with entrez_retry( + args.retries, + Entrez.efetch, + db="Protein", + query_key=epost_query_key, + WebEnv=epost_webenv, + rettype="fasta", + retmode="text", + ) as seq_handle: + for record in SeqIO.parse(seq_handle, "fasta"): + retrieved_accession = None + + # check if multiple items returned in ID + try: + retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in gbk_acc_to_retrieve][0] + except IndexError: + # try re search for accession in string + try: + retrieved_accession = re.match(r"\D{3}\d+\.\d+", record.id).group() + except AttributeError: + try: + retrieved_accession = re.match(r"\D{2}_\d+\.\d+", record.id).group() + except AttributeError: + logger.warning( + f"Could not fetch protein acc from record id '{record.id}'.\n" + "Will search all target accessions against record id" + ) + for acc in acc_to_retrieve: + if record.id.find(acc) != -1: + retrieved_accession = acc + + if retrieved_accession is None: + logger.error( + "Could not retrieve a NCBI protein version accession matching\n" + f"an accession from the local database from the record id '{record.id}'\n" + "The sequence from this record will not be added to the db" + ) + continue + + seq_records.append(record) + successful_accessions.add(retrieved_accession) + + except RuntimeError as err: + if repr(err).startswith("RuntimeError('Some IDs have invalid value and were omitted.") or repr(err).startswith("RuntimeError('Empty ID list; Nothing to store')"): + + if len(batch) == 1: + logger.warning( + f"Accession '{batch[0]}' not listed in NCBI. Querying NCBI returns the error:\n" + f"{repr(err)}\n" + f"Not retrieving seq for '{batch[0]}'" + ) + success = "Invalid ID" + + else: + logger.warning( + "Batch contains an accession no longer listed in NCBI\n" + f"Querying NCBI returns the error:\n{err}\n" + "Will identify invalid ID(s) later" + ) + success = "Contains invalid ID" + + else: # unrecognised Runtime error + if len(batch) == 1: + logger.warning( + f"Runtime error occured when fetching record from NCBI for accession '{batch[0]}'\n" + f"Error returned:\n{err}\n" + "Interrupting error as recognition of an invalid ID. 
Therefore,\n" + f"not adding seq for '{batch[0]} to the local CAZyme db'" + ) + success = "Invalid ID" + + else: + logger.warning( + f"Runtime error occured when fetching records from NCBI\n" + f"Error returned:\n{err}\n" + "Interrupting error as batch containing invalid ID.\n" + "Will identify invalid ID(s) later" + ) + success = "Contains invalid ID" + + except IncompleteRead as err: + logger.warning( + "IncompleteRead error raised when fetching record from NCBI:\n" + f"{err}\n" + "Will reattempt NCBI query later" + ) + success = "Failed connection" + + except NotXMLError as err: + logger.warning( + "NotXMLError raised when fetching record(s) from NCBI:\n" + f"{err}\n" + "Will reattempt NCBI query later" + ) + success = "Failed connection" + + except (TypeError, AttributeError) as err: # if no record is returned from call to Entrez + logger.warning( + f"Error occurenced when fetching records from NCBI\n" + "Error retrieved:\n" + f"{repr(err)}\n" + "Will retry batch later" + ) + success = "Failed connection" + + except Exception as err: # if no record is returned from call to Entrez + logger.warning( + f"Error occurenced when batch quering NCBI\n" + "Error retrieved:\n" + f"{repr(err)}\n" + "Will retry batch later" + ) + success = "Failed connection" + + return seq_records, success, success_accessions From 2d0822b49b6bb11168d48c60cef60c4567962750 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 6 Jan 2023 15:03:06 +0000 Subject: [PATCH 50/67] update imports for getting prot seqs --- .../expand/genbank/sequences/get_genbank_sequences.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index b2503e0f..a353e624 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -60,6 +60,7 @@ from tqdm import tqdm from cazy_webscraper import closing_message, connect_existing_db +from cazy_webscraper.ncbi.sequences import post_accessions_to_entrez, fetch_ncbi_seqs from cazy_webscraper.sql import sql_orm, sql_interface from cazy_webscraper.sql.sql_interface.get_data import get_selected_gbks from cazy_webscraper.sql.sql_interface.add_data import add_genbank_data @@ -393,9 +394,6 @@ def get_seqs_from_ncbi( args, ) - - - if len(accs_still_to_fetch) > 0: logger.error( f"Could not fetch seqs for {len(accs_still_to_fetch)} NCBI protein accessions.\n" From 75f34052f8143814d751948a50bd37f1781bd282 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 6 Jan 2023 16:24:53 +0000 Subject: [PATCH 51/67] process invalids containing batches after processing failed connection batches. 
Process each separetly --- .../sequences/get_genbank_sequences.py | 183 ++++++++---------- 1 file changed, 86 insertions(+), 97 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index a353e624..68d1ae1c 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -394,16 +394,6 @@ def get_seqs_from_ncbi( args, ) - if len(accs_still_to_fetch) > 0: - logger.error( - f"Could not fetch seqs for {len(accs_still_to_fetch)} NCBI protein accessions.\n" - "Caching these protein accessions" - ) - failed_ids_cache = cache_dir / "failed_retrieve_ids" - with open(failed_ids_cache, "a") as fh: - for acc in accs_still_to_fetch: - fh.write(f"{acc}\n") - # add seq records to seq dict for record in seq_records: try: @@ -416,95 +406,26 @@ def get_seqs_from_ncbi( except KeyError: seq_dict[record.id] = record.seq - logger.warning(f"Retrieved {len(seq_records)} from NCBI") - - return seq_dict - - # list of downloaded SeqRecords and list of gbk acc for which no seq was retrieved from NCBI - downloaded_seqs, failed_queries = get_sequences(all_queries, cache_dir, args) - - # retry failed accs - no_seq_acc = [] - if len(failed_queries) != 0: - logger.warning( - "Parsing accessions which could not retrieve a seq for the first time\n" - f"Handling {len(failed_queries)} failed batches" - ) - # break up and query individually - retrying_acc = {} # {acc: # of tries} - for batch in failed_queries: - for acc in batch: - retrying_acc[acc] = 0 - - finished_retry = False - - acc_list = list(retrying_acc.keys()) - - no_seq_acc = set() # set of accessions for which no seq could be retrieved - - while finished_retry is False: - acc_list = list(retrying_acc.keys()) - - logger.warning( - "Handling failed protein accessions\n" - f"{len(acc_list)} protein accessions remaining" - ) - - for accession in tqdm(acc_list, desc="Handling failed accessions"): - new_seq, failed_seq = get_sequences([[accession]], cache_dir, args, disable_prg_bar=True) - - if len(new_seq) == 0: - logger.warning( - f"Failed to retrieve sequence for {accession} on try no. {retrying_acc[accession]}" - ) - retrying_acc[accession] += 1 - - if retrying_acc[accession] >= args.retries: - logger.warning( - f"Could not retrieve sequence for {accession}\n" - "Ran out of attempts" - ) - del retrying_acc[accession] - no_seq_acc.add(accession) - - acc_list = list(retrying_acc.keys()) - - if len(acc_list) > 0: - finished_retry = True - - # cache accs of proteins for which not seq could be retrieved from NCBI - if len(no_seq_acc) != 0: - no_seq_cache = cache_dir / f"no_seq_retrieved_{start_time}.txt" - - logger.warning( - f"No protein sequence retrieved for {len(no_seq_acc)} proteins\n" - f"The GenBank accessions for these proteins have been written to: {no_seq_cache}" - ) - + accs_still_to_fetch = set() + for acc in accs_seqs_to_retrieve: try: - with open(no_seq_cache, "a") as fh: - for acc in no_seq_acc: - fh.write(f"{acc}\n") - except FileNotFoundError: - logger.error( - "Could not cache acc of proteins for which not seqs were retrieved from NCBI to:\n" - f"{no_seq_cache}" - ) - - # only cache the sequence. 
Seq obj is not JSON serializable - cache_dict = {} - for key in seq_dict: - cache_dict[key] = str(seq_dict[key]) - - # cache the retrieved sequences - cache_path = cache_dir / f"genbank_seqs_{start_time}.json" - with open(cache_path, "w") as fh: - json.dump(cache_dict, fh) + seq_dict[acc] + except KeyError: + accs_still_to_fetch.add(acc) - for record in downloaded_seqs: - seq_dict[record.id] = record.seq + if len(accs_still_to_fetch) > 0: + logger.error( + f"Could not fetch seqs for {len(accs_still_to_fetch)} NCBI protein accessions.\n" + "Caching these protein accessions" + ) + failed_ids_cache = cache_dir / "failed_retrieve_ids" + with open(failed_ids_cache, "a") as fh: + for acc in accs_still_to_fetch: + fh.write(f"{acc}\n") - return seq_dict, gbk_dict + logger.warning(f"Retrieved {len(seq_records)} from NCBI") + + return seq_dict def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False): @@ -676,10 +597,78 @@ def parse_failed_connections( fh.write(f"{protein_acc}\n") del failed_connections_batches[str(batch)] + else: + failed_connections_batches[str(failed_connections_batches)]['#_of_attempts'] += 1 + return seq_records, batches_with_invalid_ids - +def parse_invalid_id_batches( + seq_records, + batches_with_invalid_ids, + cache_path, + invalid_ids_cache, + args, +): + """Separate accessions in batches containing invalid IDs. Retrieve seqs for valid IDs. + + :param seq_records: list of Bio.SeqRecords + :param batches_with_invalid_ids: list of batches containing invalid IDs + :param cache_path: path to cache downloaded seqs + :param invalid_ids_cache: path to cache invalid IDs + :param args: CLI args parser + + Return updated list of seq_records + """ + logger = logging.getLogger(__name__) + + # separaet accessions into individual batches to identify invalid IDs + batches = [] + for batch in batches_with_invalid_ids: + for acc in batch: + batches.append([acc]) + + logger.warning( + "Separted accessions in batches containing invalid IDs.\n" + f"Processing {len(batches)} batches" + ) + + # inital parse + ( + new_seq_records, + batches_with_invalid_ids, + failed_connections_batches, + accs_still_to_fetch, + ) = get_seqs( + batches, + cache_path, + invalid_ids_cache, + args, + ) + + seq_records += new_seq_records + + if len(list(failed_connections_batches.keys())) > 0: + # Retry failed batches before parsing invalid IDs as the failed connections + # may contain invalid IDs + ( + new_seq_records, + batches_with_invalid_ids, + accs_still_to_fetch, + ) = parse_failed_connections( + failed_connections_batches, + seq_records, + accs_still_to_fetch, + cache_dir, + cache_path, + batches_with_invalid_ids. + invalid_ids_cache, + args, + ) + seq_records += new_seq_records + + return seq_records + if __name__ == "__main__": main() From a70c2b96c863cd400c3e6b5ca19165633fbb77d3 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 6 Jan 2023 16:56:43 +0000 Subject: [PATCH 52/67] update version number --- setup.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/setup.py b/setup.py index 8882c3cc..568abaca 100644 --- a/setup.py +++ b/setup.py @@ -53,11 +53,7 @@ setuptools.setup( name="cazy_webscraper", -<<<<<<< HEAD version="2.2.5", -======= - version="2.2.3", ->>>>>>> update version number # Metadata author="Emma E. M. 
Hobbs", author_email="eemh1@st-andrews.ac.uk", @@ -92,7 +88,6 @@ }, install_requires=[ "biopython>=1.76", - "bioservices>=1.10.4", "mechanicalsoup", "pandas>=1.0.3", "pyyaml", From 9a896e78eda2af7acfacae3caca6701307bc5d1d Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 6 Jan 2023 16:57:26 +0000 Subject: [PATCH 53/67] log when cache and downloaded seqs don't match use downloaded seq and overwrite cached seq --- .../expand/genbank/sequences/get_genbank_sequences.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index 68d1ae1c..b5cb94ff 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -400,7 +400,10 @@ def get_seqs_from_ncbi( seq = seq_dict[record.id] if seq != record.seq: logger.warning( - f"" + f"Sequence downloaded from NCBI for protein '{record.id}' does not match the seq\n" + "retrieved from the cache:\n" + f"Downloaded seq:\n{record.seq}\nSeq from cache:\n{seq}\n" + "Overwriting seq from cache and using downloaded seq" ) seq_dict[record.id] = record.seq except KeyError: From d4ecf03f5a9972a96d7feb51365893357270021d Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Fri, 6 Jan 2023 17:06:19 +0000 Subject: [PATCH 54/67] correct syntax errors add missing commas, remove unneeded brackets --- .../expand/genbank/sequences/get_genbank_sequences.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index b5cb94ff..85de867c 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -151,7 +151,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No else: # retrieve data from NCBI for seqs in the local db seq_dict = get_seqs_from_ncbi( - accs_seqs_to_retrieve + accs_seqs_to_retrieve, seq_dict, start_time, cache_dir, @@ -318,13 +318,13 @@ def get_records_to_retrieve( del gbk_dict[gbk_acc] gbk_dict = get_selected_gbks.get_ids(all_accessions, connection) - accs_to_retrieve = [for acc in all_accessions if acc not in list(seq_dict.keys())] + accs_to_retrieve = [acc for acc in all_accessions if acc not in list(seq_dict.keys())] return gbk_dict, accs_to_retrieve def get_seqs_from_ncbi( - accs_seqs_to_retrieve + accs_seqs_to_retrieve, seq_dict, start_time, cache_dir, @@ -542,7 +542,7 @@ def parse_failed_connections( failed_connection_cache = cache_dir / "failed_entrez_connection_accessions" failed_batches = [failed_connections_batches[batch_name]['batch'] for batch_name in failed_connections_batches] - while (len(list(failed_connections_batches.keys())) != 0: + while len(list(failed_connections_batches.keys())) != 0: # remove processed batches from failed_batches for batch in failed_batches: try: From b0f13841f95d436a25cd8c0fa7a9dcbf854e5536 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 10 Jan 2023 11:50:31 +0000 Subject: [PATCH 55/67] add missing args to func calls --- .../sequences/get_genbank_sequences.py | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index 85de867c..5f474c2b 
100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -140,6 +140,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No seq_dict, start_time, connection, + cache_dir, args, ) @@ -256,6 +257,7 @@ def get_records_to_retrieve( seq_dict, start_time, connection, + cache_dir, args, ): """Get Genbank accessions to retrieved data for from NCBI and genbank accessions and local @@ -264,20 +266,21 @@ def get_records_to_retrieve( :param seq_dict: dict {id: seq} of seqs retrieved from cache json/fasta files :param start_time: str: time program was called :param connection: open connection to a SQLite db engine + :param cache_dir: Path to cache directory :param args: CLI args parser Return a dict {gbk_acc: gbk db id} and list of genbank accs to retrieve seqs for. """ logger = logging.getLogger(__name__) - all_accessions = list(seq_dict.keys()) + all_accessions = list(seq_dict.keys()) # cache + (file or db) if args.file_only: gbk_dict = get_selected_gbks.get_ids(all_accessions, connection) accs_to_retrieve = [] - return gbk_dict, accs_seqs_to_retrieve + return gbk_dict, accs_to_retrieve - accessions = [] + accessions_of_interest = [] # acc from file or from db that match provided criteria # retrieve dict of genbank accession and genbank db ids from the local CAZyme db if args.genbank_accessions is not None: @@ -295,12 +298,13 @@ def get_records_to_retrieve( ) closing_message("Get GenBank seqs", start_time, args, early_term=True) - accessions = [line.strip() for line in lines if line not in list(seq_dict.keys())] - all_accessions += list(set(accessions)) + # don't add accessions that are already in the cache + accessions_of_interest = [line.strip() for line in lines if line not in list(seq_dict.keys())] + all_accessions += list(set(accessions_of_interest)) else: logger.warning("Getting GenBank accessions of proteins matching the provided criteria from the local CAZyme db") - selected_gbk_dict = get_selected_gbks.get_genbank_accessions( + gbk_dict = get_selected_gbks.get_genbank_accessions( class_filters, family_filters, taxonomy_filter_dict, @@ -309,18 +313,13 @@ def get_records_to_retrieve( connection, ) - accessions = list(gbk_dict.keys()) - all_accessions += accessions - - # don't retrieve data for cached seqs - for gbk_acc in gbk_accs: - if gbk_acc in list(seq_dict.keys()): - del gbk_dict[gbk_acc] + # don't get seq for accession retrieved from cache + accessions_of_interest = [acc for acc in gbk_dict if acc not in list(seq_dict.keys())] + all_accessions += accessions_of_interest - gbk_dict = get_selected_gbks.get_ids(all_accessions, connection) - accs_to_retrieve = [acc for acc in all_accessions if acc not in list(seq_dict.keys())] + gbk_dict = get_selected_gbks.get_ids(set(all_accessions), connection, cache_dir) - return gbk_dict, accs_to_retrieve + return gbk_dict, accessions_of_interest def get_seqs_from_ncbi( From 53198c12cf619ea7a6c317865cb4e9ce3310e910 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 10 Jan 2023 11:50:43 +0000 Subject: [PATCH 56/67] add missing IncompeteRead import --- cazy_webscraper/ncbi/sequences/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py index 1f56f575..96518a87 100644 --- a/cazy_webscraper/ncbi/sequences/__init__.py +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -41,6 +41,8 @@ import logging +from http.client import 
IncompleteRead + from Bio import Entrez from saintBioutils.genbank import entrez_retry from saintBioutils.misc import get_chunks_list From 6bb4849da393bb6bbf22fbe75ff8f756b3787e34 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 10 Jan 2023 11:51:09 +0000 Subject: [PATCH 57/67] add missing IncompeteRead import --- cazy_webscraper/ncbi/sequences/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py index 96518a87..0f45c47d 100644 --- a/cazy_webscraper/ncbi/sequences/__init__.py +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -44,6 +44,7 @@ from http.client import IncompleteRead from Bio import Entrez +from Bio.Entrez.Parser import NotXMLError from saintBioutils.genbank import entrez_retry from saintBioutils.misc import get_chunks_list from tqdm import tqdm From 2af746996bf434b8cd982312b8354a9c42ab052b Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 10 Jan 2023 11:52:08 +0000 Subject: [PATCH 58/67] correct . to , typo --- .../expand/genbank/sequences/get_genbank_sequences.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index 5f474c2b..04db7f88 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -373,7 +373,7 @@ def get_seqs_from_ncbi( accs_still_to_fetch, cache_dir, cache_path, - batches_with_invalid_ids. + batches_with_invalid_ids, invalid_ids_cache, args, ) From 31193469872af3c13d41a039dbc2a95a626e3f93 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Tue, 10 Jan 2023 12:02:47 +0000 Subject: [PATCH 59/67] add missing args param to func calls --- .../sequences/get_genbank_sequences.py | 38 +++++++++++++------ cazy_webscraper/ncbi/sequences/__init__.py | 9 +++-- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index 04db7f88..ae143555 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -460,7 +460,7 @@ def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False successful_accessions = [] # accessions for which seqs were retrieved for batch in tqdm(batches, desc="Batch querying NCBI.Entrez", disable=disable_prg_bar): - epost_webenv, epost_query_key, success = post_accessions_to_entrez(batch) + epost_webenv, epost_query_key, success = post_accessions_to_entrez(batch, args) if success == "Invalid ID": # invalid ID, cache, and do not query again @@ -478,14 +478,20 @@ def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False elif success == "Failed connection": # Connection failed so retry later - failed_connections_batches[str(batch)] = { + failed_connections_batches["_".join(batch)] = { 'batch': batch, '#_of_attempts': 1 } continue # else was a success, so proceed to retrieving protein seqs - seq_records, success, temp_successful_accessions = fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, gbk_acc_to_retrieve) + seq_records, success, temp_successful_accessions = fetch_ncbi_seqs( + seq_records, + epost_webenv, + epost_query_key, + gbk_acc_to_retrieve, + args, + ) if success == "Invalid ID": invalid_ids.add(batch[0]) @@ -499,7 
+505,7 @@ def get_seqs(batches, cache_path, invalid_ids_cache, args, disable_prg_bar=False continue elif success == "Failed connection": - failed_connections_batches[str(batch)] = { + failed_connections_batches["_".join(batch)] = { 'batch': batch, '#_of_attempts': 1 } @@ -539,13 +545,21 @@ def parse_failed_connections( """ logger = logging.getLogger(__name__) failed_connection_cache = cache_dir / "failed_entrez_connection_accessions" - failed_batches = [failed_connections_batches[batch_name]['batch'] for batch_name in failed_connections_batches] + failed_batches = [] + for batch in failed_connections_batches: + failed_batches.append(failed_connections_batches["_".join(batch)]['batch']) + + print('******************************************************') + print(failed_connections_batches) + print('------------------------------------------------------') + print(failed_batches) + print('******************************************************') while len(list(failed_connections_batches.keys())) != 0: # remove processed batches from failed_batches for batch in failed_batches: try: - failed_connections_batches[str(batch)] + failed_connections_batches["_".join(batch)] except KeyError: # batch has been processed and no longer in failed_connections # delete batch from list @@ -575,20 +589,20 @@ def parse_failed_connections( if (batch not in new_batches_with_invalid_ids) and (batch not in list(new_failed_connections_batches.keys())): # not in invalid IDs or new failed batches, presume batch was processed failed_batches.remove(batch) - del failed_connections_batches[str(batch)] + del failed_connections_batches["_".join(batch)] batches_with_invalid_ids += new_batches_with_invalid_ids # remove batches with invalid IDs, so don't retry connection for batch in batches_with_invalid_ids: - failed_connections_batches[str(batch)] + failed_connections_batches["_".join(batch)] failed_batches.remove(batch) # remove batches that were processed successfully and # increate attempt count for batches whose connections failed for batch in new_failed_connections_batches: - failed_connections_batches[str(batch)] + failed_connections_batches["_".join(batch)] - if failed_connections_batches[str(failed_connections_batches)]['#_of_attempts'] >= args.retries: + if failed_connections_batches["_".join(batch)]['#_of_attempts'] >= args.retries: logger.error( "Ran out of connection attempts for the following batch:\n" f"{batch}\n" @@ -597,10 +611,10 @@ def parse_failed_connections( with open(failed_connection_cache, "a") as fh: for protein_acc in batch: fh.write(f"{protein_acc}\n") - del failed_connections_batches[str(batch)] + del failed_connections_batches["_".join(batch)] else: - failed_connections_batches[str(failed_connections_batches)]['#_of_attempts'] += 1 + failed_connections_batches["_".join(batch)]['#_of_attempts'] += 1 return seq_records, batches_with_invalid_ids diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py index 0f45c47d..6bab937f 100644 --- a/cazy_webscraper/ncbi/sequences/__init__.py +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -50,11 +50,11 @@ from tqdm import tqdm -def post_accessions_to_entrez(batch): +def post_accessions_to_entrez(batch, args): """Post NCBI protein accessions to NCBI via Entrez, and capture error message if one returned. :param batch: list of genbank accessions - :param entrez_function: Entrez function (epost or efetch) + :param args: CLI args parser Return Entrez ePost web environment and query key. 
""" @@ -128,7 +128,7 @@ def post_accessions_to_entrez(batch): except (TypeError, AttributeError) as err: # if no record is returned from call to Entrez logger.warning( - f"Error occurenced when batch quering NCBI\n" + f"Error occurenced when batch posting IDs to NCBI\n" "Error retrieved:\n" f"{repr(err)}\n" "Will retry batch later" @@ -156,13 +156,14 @@ def post_accessions_to_entrez(batch): return epost_webenv, epost_query_key, success -def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve): +def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve, args): """Retrieve protein sequences from NCBI from ePost of protein v.accs. :param seq_records: list of Bio.SeqRecords :param epost_websenv: Entrez ePost webenvironment :param epost_query_key: Entrez ePost query key :param acc_to_retrieve: set of NCBI protein version accessions to retrieve seqs for + :param args: CLI args parser Return updated list of SeqRecords and string marking success/failure or seq retrieval. """ From 8ce63b6136faaae7ee1a531487c95a2ff41cb247 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Wed, 11 Jan 2023 09:31:51 +0000 Subject: [PATCH 60/67] use joined list as str as failed connections key --- .../expand/genbank/sequences/get_genbank_sequences.py | 4 ++-- cazy_webscraper/ncbi/sequences/__init__.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index ae143555..bc893296 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -546,8 +546,8 @@ def parse_failed_connections( logger = logging.getLogger(__name__) failed_connection_cache = cache_dir / "failed_entrez_connection_accessions" failed_batches = [] - for batch in failed_connections_batches: - failed_batches.append(failed_connections_batches["_".join(batch)]['batch']) + for batch_name in failed_connections_batches: + failed_batches.append(failed_connections_batches[batch_name]['batch']) print('******************************************************') print(failed_connections_batches) diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py index 6bab937f..1129d92b 100644 --- a/cazy_webscraper/ncbi/sequences/__init__.py +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -43,7 +43,10 @@ from http.client import IncompleteRead -from Bio import Entrez +from Bio import Entrez, SeqIO +from Bio.Entrez.Parser import NotXMLError +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord from Bio.Entrez.Parser import NotXMLError from saintBioutils.genbank import entrez_retry from saintBioutils.misc import get_chunks_list @@ -137,7 +140,7 @@ def post_accessions_to_entrez(batch, args): except Exception as err: # if no record is returned from call to Entrez logger.warning( - f"Error occurenced when batch quering NCBI\n" + f"Error occurenced when posting IDs to NCBI\n" "Error retrieved:\n" f"{repr(err)}\n" "Will retry batch later" @@ -278,7 +281,7 @@ def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve, except Exception as err: # if no record is returned from call to Entrez logger.warning( - f"Error occurenced when batch quering NCBI\n" + f"Error occurenced when fetching sequences from NCBI\n" "Error retrieved:\n" f"{repr(err)}\n" "Will retry batch later" From 63f869b9ecc25eb8bd769f6f1c6657c6ded63035 Mon 
Sep 17 00:00:00 2001 From: HobnobMancer Date: Wed, 11 Jan 2023 09:32:06 +0000 Subject: [PATCH 61/67] correct typos success to successful --- cazy_webscraper/ncbi/sequences/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py index 1129d92b..b721c970 100644 --- a/cazy_webscraper/ncbi/sequences/__init__.py +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -288,4 +288,4 @@ def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve, ) success = "Failed connection" - return seq_records, success, success_accessions + return seq_records, success, successful_accessions From 2d08be9c4687555eabf89e0a405ad5fb4edf52f8 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Wed, 11 Jan 2023 09:33:48 +0000 Subject: [PATCH 62/67] correct old var name to new var name: gbk_acc_to_retrieve to acc_to_retrieve --- cazy_webscraper/ncbi/sequences/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py index b721c970..b046eeb9 100644 --- a/cazy_webscraper/ncbi/sequences/__init__.py +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -188,7 +188,7 @@ def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve, # check if multiple items returned in ID try: - retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in gbk_acc_to_retrieve][0] + retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in acc_to_retrieve][0] except IndexError: # try re search for accession in string try: From 771b3d18c1e49a4850a8e096ae53add0b542c8d4 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Wed, 11 Jan 2023 09:36:36 +0000 Subject: [PATCH 63/67] correct list to set: successful_accessions --- cazy_webscraper/ncbi/sequences/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py index b046eeb9..e96df274 100644 --- a/cazy_webscraper/ncbi/sequences/__init__.py +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -171,7 +171,7 @@ def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve, Return updated list of SeqRecords and string marking success/failure or seq retrieval. 
""" logger = logging.getLogger(__name__) - success, successful_accessions = None, [] + success, successful_accessions = None, set() try: with entrez_retry( @@ -288,4 +288,4 @@ def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve, ) success = "Failed connection" - return seq_records, success, successful_accessions + return seq_records, success, list(successful_accessions) From 79350d2f3194dad92874a68dfa84eb81cda18295 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Wed, 11 Jan 2023 10:17:11 +0000 Subject: [PATCH 64/67] remove whitespace from ids --- cazy_webscraper/ncbi/sequences/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cazy_webscraper/ncbi/sequences/__init__.py b/cazy_webscraper/ncbi/sequences/__init__.py index e96df274..af66934b 100644 --- a/cazy_webscraper/ncbi/sequences/__init__.py +++ b/cazy_webscraper/ncbi/sequences/__init__.py @@ -188,7 +188,7 @@ def fetch_ncbi_seqs(seq_records, epost_webenv, epost_query_key, acc_to_retrieve, # check if multiple items returned in ID try: - retrieved_accession = [_ for _ in record.id.split("|") if _.strip() in acc_to_retrieve][0] + retrieved_accession = [_.strip() for _ in record.id.split("|") if _.strip() in acc_to_retrieve][0] except IndexError: # try re search for accession in string try: From 94d59d1f0608e645ba239107824c69028d78b3b6 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Wed, 11 Jan 2023 14:31:33 +0000 Subject: [PATCH 65/67] fix failed_connections bug do not remove batch from dict of failed batches multiple times, only once finished parsing batch --- .../sequences/get_genbank_sequences.py | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py index bc893296..f6a49a3d 100644 --- a/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py +++ b/cazy_webscraper/expand/genbank/sequences/get_genbank_sequences.py @@ -98,7 +98,7 @@ def main(argv: Optional[List[str]] = None, logger: Optional[logging.Logger] = No cache_dir = args.cache_dir make_output_directory(cache_dir, args.force, args.nodelete_cache) else: - cache_dir = cache_dir / "genbank_data_retrieval" + cache_dir = cache_dir / "genbank_seq_retrieval" make_output_directory(cache_dir, args.force, args.nodelete_cache) ( @@ -366,7 +366,6 @@ def get_seqs_from_ncbi( ( seq_records, batches_with_invalid_ids, - accs_still_to_fetch, ) = parse_failed_connections( failed_connections_batches, seq_records, @@ -374,7 +373,7 @@ def get_seqs_from_ncbi( cache_dir, cache_path, batches_with_invalid_ids, - invalid_ids_cache, + invalid_ids_cache, args, ) @@ -385,7 +384,7 @@ def get_seqs_from_ncbi( for batch in batches_with_invalid_ids: for acc in batch: individual_batches.append([acc]) - + seq_records, accs_still_to_fetch = parse_invalid_id_batches( batches_with_invalid_ids, cache_path, @@ -426,7 +425,7 @@ def get_seqs_from_ncbi( fh.write(f"{acc}\n") logger.warning(f"Retrieved {len(seq_records)} from NCBI") - + return seq_dict @@ -549,12 +548,6 @@ def parse_failed_connections( for batch_name in failed_connections_batches: failed_batches.append(failed_connections_batches[batch_name]['batch']) - print('******************************************************') - print(failed_connections_batches) - print('------------------------------------------------------') - print(failed_batches) - print('******************************************************') - while 
len(list(failed_connections_batches.keys())) != 0: # remove processed batches from failed_batches for batch in failed_batches: @@ -589,7 +582,6 @@ def parse_failed_connections( if (batch not in new_batches_with_invalid_ids) and (batch not in list(new_failed_connections_batches.keys())): # not in invalid IDs or new failed batches, presume batch was processed failed_batches.remove(batch) - del failed_connections_batches["_".join(batch)] batches_with_invalid_ids += new_batches_with_invalid_ids # remove batches with invalid IDs, so don't retry connection @@ -599,10 +591,8 @@ def parse_failed_connections( # remove batches that were processed successfully and # increate attempt count for batches whose connections failed - for batch in new_failed_connections_batches: - failed_connections_batches["_".join(batch)] - - if failed_connections_batches["_".join(batch)]['#_of_attempts'] >= args.retries: + for batch_name in new_failed_connections_batches: + if failed_connections_batches[batch_name]['#_of_attempts'] >= args.retries: logger.error( "Ran out of connection attempts for the following batch:\n" f"{batch}\n" @@ -611,10 +601,10 @@ def parse_failed_connections( with open(failed_connection_cache, "a") as fh: for protein_acc in batch: fh.write(f"{protein_acc}\n") - del failed_connections_batches["_".join(batch)] + del failed_connections_batches[batch_name] else: - failed_connections_batches["_".join(batch)]['#_of_attempts'] += 1 + failed_connections_batches[batch_name]['#_of_attempts'] += 1 return seq_records, batches_with_invalid_ids From 4324751b9a77f74a47108f1342e091bee341fdb8 Mon Sep 17 00:00:00 2001 From: HobnobMancer Date: Wed, 11 Jan 2023 15:28:40 +0000 Subject: [PATCH 66/67] update docs on cache files --- docs/cache.rst | 70 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/docs/cache.rst b/docs/cache.rst index 069355d1..d6d5248c 100644 --- a/docs/cache.rst +++ b/docs/cache.rst @@ -33,22 +33,21 @@ The table contains the following columns and data: The 'Logs' table allows any one who uses the database to see how the dataset was compiled. ------------ -Cache files ------------ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +------------------------------------------ Cache files when retrieving data from CAZy -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +------------------------------------------ ``cazy_webscraper`` writes out 2 cache files. These are: * ``cazy_db_.zip`` which is the txt file downloaded from CAZy containing all CAZy Data * ``__connection_failures.log`` which contains a list of proteins for which data was unsuccessfully parsed, and CAZy families for which the family member population could not be retrieved (if the 'validation' option is used). -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +--------------------------------------------- Cache files when retrieving data from UniProt -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------------------------- The dataframes retrieved with each query to UniProt are cached. @@ -56,7 +55,9 @@ In addition two JSON files are created: * ``uniprot_accessions_YYYY-MM-DD_HH-MM-SS.json`` * ``uniprot_data_YYYY-MM-DD_HH-MM-SS.json`` -**UniProt accessions:** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +UniProt accessions JSON file +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The first file (``uniprot_accessions``) contains the UniProt accessions/IDs for each GenBank accession retrieved from the local CAZyme database that matches the provided criteria. 
From 4324751b9a77f74a47108f1342e091bee341fdb8 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Wed, 11 Jan 2023 15:28:40 +0000
Subject: [PATCH 66/67] update docs on cache files

---
 docs/cache.rst | 70 ++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/docs/cache.rst b/docs/cache.rst
index 069355d1..d6d5248c 100644
--- a/docs/cache.rst
+++ b/docs/cache.rst
@@ -33,22 +33,21 @@ The table contains the following columns and data:
 
 The 'Logs' table allows anyone who uses the database to see how the dataset was compiled.
 
------------
-Cache files
------------
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+------------------------------------------
 Cache files when retrieving data from CAZy
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+------------------------------------------
 
 ``cazy_webscraper`` writes out two cache files. These are:
 
 * ``cazy_db_.zip`` which is the txt file downloaded from CAZy containing all CAZy data
 * ``__connection_failures.log`` which contains a list of proteins for which data could not be parsed, and CAZy families for which the family member population could not be retrieved (if the 'validation' option is used).
 
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+---------------------------------------------
 Cache files when retrieving data from UniProt
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+---------------------------------------------
 
 The dataframes retrieved with each query to UniProt are cached.
 
 In addition, two JSON files are created:
 
 * ``uniprot_accessions_YYYY-MM-DD_HH-MM-SS.json``
 * ``uniprot_data_YYYY-MM-DD_HH-MM-SS.json``
 
-**UniProt accessions:**
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+UniProt accessions JSON file
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The first file (``uniprot_accessions``) contains the UniProt accessions/IDs for each GenBank accession
 retrieved from the local CAZyme database that matches the provided criteria. These UniProt IDs are used to query
@@ -72,7 +73,10 @@
 containing the GenBank accession and the corresponding ID of its record in the local
 
 This file can be used to skip the retrieval of UniProt IDs from UniProt (which is the first step
 performed by ``cw_get_uniprot_data``). To do this, use the ``--skip_uniprot_accessions`` flag followed by
 the path to the corresponding ``uniprot_accessions_YYYY-MM-DD_HH-MM-SS.json`` file.
 
-**UniProt data:**
+
+^^^^^^^^^^^^^^^^^^^^^^
+UniProt data JSON file
+^^^^^^^^^^^^^^^^^^^^^^
 
 The second JSON file (``uniprot_data``) contains all data retrieved from UniProt for all proteins in the
 local CAZyme database that match the specified criteria. The data retrieved from UniProt was parsed into a Python dictionary
@@ -85,20 +89,28 @@
 To use the data cached in the ``uniprot_data`` file, use the ``--use_uniprot_cache`` flag followed by a
 path pointing to the corresponding file. Using this flag skips the retrieval of protein data from UniProt,
 and only adds data from the cache file into the local CAZyme database.
 
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Cache files when retrieving data from GenBank
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+----------------------------------------------------------
+Cache files when retrieving protein sequences from GenBank
+----------------------------------------------------------
 
-When retrieving protein sequences from GenBank three cache file are produced:
+When retrieving protein sequences from GenBank, all cache files will be contained in the
+``genbank_seq_retrieval.tar`` compressed file. The cache files that will be generated are:
 
+* ``cw_genbanks_seqs_.fasta`` contains the protein sequences downloaded from GenBank
+* ``failed_retrieve_ids`` contains the GenBank accessions of **all** proteins for which no protein sequence was retrieved
+* ``failed_entrez_connection_accessions`` contains the GenBank accessions of proteins for which no protein sequence was retrieved owing to a failed connection to NCBI
+* ``invalid_ids`` contains GenBank protein version accessions that were retrieved from CAZy, but are no longer listed in NCBI
 
 .. NOTE::
-    Future features for ``cazy_webscraper`` will include the ability to use this cached protein sequences, to (i) skip the
+    ``cazy_webscraper`` includes the option to use cached protein sequences, to (i) skip the
     retrieval of data from GenBank and (ii) facilitate the reproduction of local CAZyme databases.
 
+-----------------------------------------------------------
+Cache files when retrieving taxonomic information from NCBI
+-----------------------------------------------------------
+
 When retrieving taxonomic classifications the following cache files are generated:
+
 * ``failed_protein_accs.txt`` - lists the version accessions of all proteins for which no taxonomy data could be retrieved from NCBI (possibly the protein record has been removed)
 * ``lineage_data.json`` - lists the lineage data and the associated protein sequence accessions
@@ -109,13 +121,33 @@ When retrieving taxonomic classifications the following cache files are generate
 When retrieving genomic assembly data, a file listing all protein sequence accessions for which no data
 was retrieved is cached (``failed_protein__accessions.txt``). Additionally, the downloaded genomic assembly feature
 tables are retained in their compressed format and cached.
 
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-----------------------------------------
 Cache files when retrieving data from PDB
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-----------------------------------------
 
 One cache file is created when using ``cazy_webscraper`` to retrieve protein structure files from PDB:
 ``pdb_retrieval_YYYY-MM-DD_HH-MM-SS.txt``, which lists the PDB accessions of all files that were successfully
 downloaded from PDB using ``cazy_webscraper`` and ``BioPython``.
 
+---------------------------------------------------------
+Cache files when retrieving genomic information from NCBI
+---------------------------------------------------------
+
+All genomic assembly feature tables are cached in the cache directory. In addition, two other cache files
+are generated to list the proteins and genomes for which data could not be retrieved from NCBI:
+
+* ``no_assembly_urls.txt`` contains the NCBI Assembly Names of genomic assemblies for which a feature table could not be downloaded from the NCBI FTP server - typically because a feature table is not available
+* ``failed_protein_accessions.txt`` contains the NCBI protein version accessions for which genomic assembly information could not be retrieved
+
+-----------------------------------------------------------
+Cache files when retrieving taxonomic information from GTDB
+-----------------------------------------------------------
+
+The GTDB database dumps are downloaded and written to the cache directory:
+
+* ``archaea_data-{release_number}.gz``
+* ``bacteria_data-{release_number}.gz``
+
 ---------------
 Cache directory
 ---------------
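The sequence cache documented above is plain FASTA, so it can be inspected with Biopython before being fed back into cazy_webscraper. A short sketch; the path is illustrative, since real cache file names carry the run's timestamp:

    from pathlib import Path

    from Bio import SeqIO

    # illustrative path: substitute your cache directory and run timestamp
    cache_fasta = Path("cache/genbank_seq_retrieval/cw_genbank_seqs_2023-01-11_15-28-40.fasta")

    # index the cached records by ID without loading every sequence into memory
    cached = SeqIO.index(str(cache_fasta), "fasta")

    print(f"{len(cached)} cached protein sequences")
    for accession in list(cached)[:5]:  # peek at the first few accessions
        print(accession, len(cached[accession].seq), "residues")

SeqIO.index keeps only file offsets in memory, so this stays cheap even for large sequence caches.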
From b8268e58275e9b3e022d3c96899129afa4498f08 Mon Sep 17 00:00:00 2001
From: HobnobMancer
Date: Wed, 11 Jan 2023 15:47:31 +0000
Subject: [PATCH 67/67] pin sqlalchemy requirement to a fixed version number

---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ea817a57..a9c39715 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,5 @@ numpy
 pandas
 requests
 saintBioutils>=0.0.19
-sqlalchemy>=1.4.20
+sqlalchemy==1.4.20
 tqdm

diff --git a/setup.py b/setup.py
index 568abaca..751fbef2 100644
--- a/setup.py
+++ b/setup.py
@@ -93,7 +93,7 @@
         "pyyaml",
         "requests",
         "saintBioutils>=0.0.24",
-        "sqlalchemy>=1.4.20",
+        "sqlalchemy==1.4.20",
         "tqdm",
     ],
     packages=setuptools.find_packages(),
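On the pin itself: switching from >=1.4.20 to ==1.4.20 trades automatic bug-fix updates for reproducibility, presumably to shield the package from API changes in newer SQLAlchemy releases. A trivial runtime check (a sketch, not part of the package) confirms an installed environment honours the pin:

    import sqlalchemy

    # requirements.txt and setup.py above both pin sqlalchemy to exactly 1.4.20
    assert sqlalchemy.__version__ == "1.4.20", (
        f"expected sqlalchemy 1.4.20, found {sqlalchemy.__version__}"
    )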