Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework of train_model function #65

Closed
wants to merge 8 commits into from
Closed
28 changes: 14 additions & 14 deletions bin/glycoworkGUI.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,13 @@ def leave(event):
self.tooltip.destroy()
widget.bind('<Enter>', enter)
widget.bind('<Leave>', leave)

def browse_csv(self):
file_path = filedialog.askopenfilename(filetypes = [("CSV Files", "*.csv"), ("Excel Files", "*.xlsx")])
if file_path:
self.csv_entry.delete(0, tk.END)
self.csv_entry.insert(0, file_path)

def browse_folder(self):
folder_path = filedialog.askdirectory()
if folder_path:
Expand All @@ -154,7 +154,7 @@ def openGlycoDrawExcelDialog():
class DifferentialExpressionDialog(simpledialog.Dialog):
def body(self, master):
self.title("Differential Expression Input")

# CSV file selection
tk.Label(master, text="CSV or Excel File:").grid(row = 0, sticky = tk.W)
self.csv_file_var = tk.StringVar(master)
Expand All @@ -176,23 +176,23 @@ def body(self, master):
self.output_folder_entry.grid(row = 4, column = 1)
self.output_folder_browse = tk.Button(master, text = "Browse...", command = self.browse_output_folder)
self.output_folder_browse.grid(row = 4, column = 2)

# Treatment group indices
tk.Label(master, text = "Treatment Group Columns:").grid(row = 1, sticky = tk.W)
self.treatment_entry = tk.Entry(master)
self.treatment_entry.grid(row = 1, column = 1, columnspan = 2, sticky = tk.W+tk.E)

# Control group indices
tk.Label(master, text = "Control Group Columns:").grid(row = 2, sticky = tk.W)
self.control_entry = tk.Entry(master)
self.control_entry.grid(row = 2, column = 1, columnspan = 2, sticky = tk.W+tk.E)

# Motifs option
tk.Label(master, text="Motif-based analysis:").grid(row = 3, sticky = tk.W)
self.motifs_var = tk.BooleanVar(master)
self.motifs_check = tk.Checkbutton(master, variable = self.motifs_var)
self.motifs_check.grid(row = 3, column = 1, sticky = tk.W)

return self.csv_entry # to put focus on the csv file entry widget

def create_tooltip(self, widget, text):
Expand Down Expand Up @@ -262,7 +262,7 @@ def openDifferentialExpressionDialog():
class GetHeatmapDialog(simpledialog.Dialog):
def body(self, master):
self.title("Get Heatmap Input")

# Input file selection
tk.Label(master, text = "Select Input CSV or Excel File:").grid(row = 0, sticky = tk.W)
self.input_file_entry = tk.StringVar(master)
Expand All @@ -276,7 +276,7 @@ def body(self, master):
"Ideally, rows are samples and columns are glycans (but the function can deal with the opposite)\n"
"Glycans should be ideally in IUPAC-condensed\n"
"If you do NOT analyze motifs, the glycan format does not matter at all")

# Motif analysis option
self.motif_analysis_var = tk.BooleanVar()
self.motif_analysis_check = tk.Checkbutton(master, text = "Motif Analysis", variable = self.motif_analysis_var)
Expand All @@ -291,7 +291,7 @@ def body(self, master):
self.show_all_var = tk.BooleanVar()
self.show_all_check = tk.Checkbutton(master, text = "Show all?", variable = self.show_all_var)
self.show_all_check.grid(row = 1, column = 2, sticky = tk.W)

# Output PDF file selection
tk.Label(master, text = "Select Output for Heatmap File:").grid(row = 2, sticky = tk.W)
self.output_file_entry = tk.StringVar(master)
Expand Down Expand Up @@ -360,7 +360,7 @@ def openGetHeatmapDialog():
class LectinArrayAnalysisDialog(simpledialog.Dialog):
def body(self, master):
self.title("Lectin Array Analysis Input")

# CSV or Excel file selection
tk.Label(master, text="Select CSV or Excel File:").grid(row = 0, sticky = tk.W)
self.file_entry = tk.Entry(master)
Expand All @@ -372,17 +372,17 @@ def body(self, master):
self.create_tooltip(self.help_icon, "CSV Format Help:\n\n"
"Format data as samples as rows and lectins as columns (first column = sample names)\n"
"Have lectin names in the column names")

# Treatment group indices
tk.Label(master, text = "Treatment Group Rows (comma-separated):").grid(row = 1, sticky = tk.W)
self.treatment_entry = tk.Entry(master)
self.treatment_entry.grid(row = 1, column = 1, columnspan = 2, sticky = tk.W+tk.E)

# Control group indices
tk.Label(master, text = "Control Group Rows (comma-separated):").grid(row = 2, sticky = tk.W)
self.control_entry = tk.Entry(master)
self.control_entry.grid(row = 2, column = 1, columnspan = 2, sticky = tk.W+tk.E)

# Paired analysis option
tk.Label(master, text = "Paired Analysis:").grid(row = 3, sticky = tk.W)
self.paired_var = tk.BooleanVar()
Expand Down
24 changes: 10 additions & 14 deletions glycowork/glycan_data/data_entry.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
import pandas as pd
from typing import Optional
from glycowork.motif.processing import check_nomenclature
from glycowork.motif.graph import glycan_to_nxGraph, compare_glycans


def check_presence(glycan, df, colname = 'glycan',
name = None, rank = 'Species', fast = False):
"""checks whether glycan (of that species) is already present in dataset\n
| Arguments:
| :-
| glycan (string): IUPAC-condensed glycan sequence
| df (dataframe): glycan dataframe where glycans are under colname and ideally taxonomic labels are columns
| name (string): name of the species (etc.) of interest
| rank (string): column name for filtering; default: species
| fast (bool): True uses precomputed glycan graphs, only use if df has column 'graph' with glycan graphs\n
| Returns:
| :-
| Returns text output regarding whether the glycan is already in df
"""
def check_presence(glycan: str, # IUPAC-condensed glycan sequence
df: pd.DataFrame, # glycan dataframe where glycans are under colname
colname: str = 'glycan', # column name containing glycans
name: Optional[str] = None, # name of species of interest
rank: str = 'Species', # column name for filtering
fast: bool = False # True uses precomputed glycan graphs
) -> None:
"checks whether glycan (of that species) is already present in dataset"
if any([p in glycan for p in ['RES', '=']]) or not isinstance(glycan, str):
check_nomenclature(glycan)
return
Expand Down
150 changes: 55 additions & 95 deletions glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from os import path
from itertools import chain
from importlib import resources
from typing import Any, Dict, Union
from typing import Any, Dict, List, Optional

with resources.open_text("glycowork.glycan_data", "glycan_motifs.csv") as f:
motif_list = pd.read_csv(f)
Expand Down Expand Up @@ -85,30 +85,30 @@ def __dir__(self):
'OS': {'GlcNAc', 'Gal', 'GalNAc'}}


def unwrap(nested_list):
"""converts a nested list into a flat list"""
def unwrap(nested_list: List[Any] # list to be flattened
          ) -> List[Any]: # flattened list
  "converts a nested list into a flat list"
  # chain.from_iterable iterates the sublists lazily instead of unpacking the
  # whole outer list into the call's argument tuple, so very long inputs work too
  return list(chain.from_iterable(nested_list))


def find_nth(haystack, needle, n):
"""finds n-th instance of motif\n
| Arguments:
| :-
| haystack (string): string to search for motif
| needle (string): motif
| n (int): n-th occurrence in string (not zero-indexed)\n
| Returns:
| :-
| Returns starting index of n-th occurrence in string
"""
def find_nth(haystack: str, # string to search for motif
             needle: str, # motif
             n: int # n-th occurrence in string (not zero-indexed)
             ) -> int: # starting index of n-th occurrence; -1 if absent
  "finds n-th instance of motif"
  # locate the first hit, then skip forward (n-1) more non-overlapping hits
  pos = haystack.find(needle)
  remaining = n - 1
  while remaining > 0 and pos >= 0:
    pos = haystack.find(needle, pos + len(needle))
    remaining -= 1
  return pos


def find_nth_reverse(string, substring, n, ignore_branches = False):
def find_nth_reverse(string: str, # string to search
substring: str, # substring to find
n: int, # n-th occurrence from end
ignore_branches: bool = False # whether to ignore branches when counting
) -> int: # position of n-th occurrence from end
"finds n-th instance of motif from end of string"
# Reverse the string and the substring
reversed_string = string[::-1]
reversed_substring = substring[::-1]
Expand Down Expand Up @@ -142,15 +142,9 @@ def find_nth_reverse(string, substring, n, ignore_branches = False):
return original_start_index


def remove_unmatched_brackets(s):
"""Removes all unmatched brackets from the string s.\n
| Arguments:
| :-
| s (string): glycan string in IUPAC-condensed\n
| Returns:
| :-
| Returns glycan without unmatched brackets
"""
def remove_unmatched_brackets(s: str # glycan string in IUPAC-condensed
) -> str: # glycan without unmatched brackets
"Removes all unmatched brackets from the string s"
while True:
# Keep track of the indexes of the brackets
stack = []
Expand All @@ -173,48 +167,30 @@ def remove_unmatched_brackets(s):
return s


def reindex(df_new, df_old, out_col, ind_col, inp_col):
"""Returns columns values in order of new dataframe rows\n
| Arguments:
| :-
| df_new (pandas dataframe): dataframe with the new row order
| df_old (pandas dataframe): dataframe with the old row order
| out_col (string): column name of column in df_old that you want to reindex
| ind_col (string): column name of column in df_old that will give the index
| inp_col (string): column name of column in df_new that indicates the new order; ind_col and inp_col should match\n
| Returns:
| :-
| Returns out_col from df_old in the same order of inp_col in df_new
"""
def reindex(df_new: pd.DataFrame, # dataframe with new row order
            df_old: pd.DataFrame, # dataframe with old row order
            out_col: str, # column name in df_old to reindex
            ind_col: str, # column name in df_old for index
            inp_col: str # column name in df_new for new order
            ) -> list: # out_col from df_old reordered to match inp_col in df_new
  "Returns columns values in order of new dataframe rows"
  if ind_col != inp_col:
    print("Mismatching column names for ind_col and inp_col. Doesn't mean it's wrong but pay attention.")
  # build the key -> value lookup once (O(n)) instead of an O(n) list.index()
  # scan per row; keep the FIRST occurrence of duplicate keys, like list.index did
  mapping = {}
  for key, val in zip(df_old[ind_col], df_old[out_col]):
    if key not in mapping:
      mapping[key] = val
  try:
    return [mapping[k] for k in df_new[inp_col]]
  except KeyError as e:
    # preserve the ValueError that list.index raised for missing keys
    raise ValueError(f"{e.args[0]!r} is not in df_old[{ind_col!r}]") from e


def stringify_dict(dicty):
"""Converts dictionary into a string\n
| Arguments:
| :-
| dicty (dictionary): dictionary\n
| Returns:
| :-
| Returns string of type key:value for sorted items
"""
def stringify_dict(dicty: Dict[Any, Any] # dictionary to convert
                   ) -> str: # string of type key:value for sorted items
  "Converts dictionary into a string"
  # concatenate key/value pairs in sorted key order
  pieces = [f"{k}{v}" for k, v in sorted(dicty.items())]
  return ''.join(pieces)


def replace_every_second(string, old_char, new_char):
"""function to replace every second occurrence of old_char in string with new_char\n
| Arguments:
| :-
| string (string): a string
| old_char (string): a string character to be replaced (every second occurrence)
| new_char (string): the string character to replace old_char with\n
| Returns:
| :-
| Returns string with replaced characters
"""
def replace_every_second(string: str, # input string
old_char: str, # character to replace
new_char: str # character to replace with
) -> str: # modified string
"function to replace every second occurrence of old_char in string with new_char"
count = 0
result = []
for char in string:
Expand All @@ -226,36 +202,25 @@ def replace_every_second(string, old_char, new_char):
return ''.join(result)


def multireplace(string, remove_dic):
"""Replaces all occurences of items in a set with a given string\n
| Arguments:
| :-
| string (str): string to perform replacements on
| remove_dic (set): dict of form to_replace:replace_with\n
| Returns:
| :-
| (str) modified string
"""
def multireplace(string: str, # string to perform replacements on
                 remove_dic: Dict[str, str] # dict of form to_replace:replace_with
                 ) -> str: # modified string
  "Replaces all occurrences of each dict key in string with its corresponding value"
  # replacements run sequentially in dict order, so an earlier replacement
  # can affect what a later one matches
  for k, v in remove_dic.items():
    string = string.replace(k, v)
  return string


def strip_suffixes(columns):
"""Strip numerical suffixes like .1, .2, etc., from column names."""
def strip_suffixes(columns: List[Any] # column names
                   ) -> List[str]: # column names without numerical suffixes
  "Strip numerical suffixes like .1, .2, etc., from column names"
  # compile once, then strip a trailing ".<digits>" from each (stringified) name
  suffix_pattern = re.compile(r"\.\d+$")
  return [suffix_pattern.sub("", str(col)) for col in columns]


def build_custom_df(df, kind = 'df_species'):
"""creates custom df from df_glycan\n
| Arguments:
| :-
| df (dataframe): df_glycan / sugarbase
| kind (string): whether to create 'df_species', 'df_tissue', or 'df_disease' from df_glycan; default:df_species\n
| Returns:
| :-
| Returns custom df in the form of one glycan - species/tissue/disease association per row
"""
def build_custom_df(df: pd.DataFrame, # df_glycan / sugarbase
kind: str = 'df_species' # whether to create 'df_species', 'df_tissue', or 'df_disease'
) -> pd.DataFrame: # custom df with one glycan - species/tissue/disease association per row
"creates custom df from df_glycan"
kind_to_cols = {
'df_species': ['glycan', 'Species', 'Genus', 'Family', 'Order', 'Class',
'Phylum', 'Kingdom', 'Domain', 'ref'],
Expand All @@ -274,8 +239,10 @@ def build_custom_df(df, kind = 'df_species'):
return df


def download_model(file_id, local_path = 'model_weights.pt'):
"""Download the model weights file from Google Drive."""
def download_model(file_id: str, # Google Drive share URL or bare file ID
                   local_path: str = 'model_weights.pt' # where to save model file
                   ) -> None:
  "Download the model weights file from Google Drive"
  # accept both a full share link (.../d/<id>/view...) and a bare file ID;
  # the original unconditional split raised IndexError on bare IDs
  if '/d/' in file_id:
    file_id = file_id.split('/d/')[1].split('/view')[0]
  url = f'https://drive.google.com/uc?id={file_id}'
  gdown.download(url, local_path, quiet = False)
Expand Down Expand Up @@ -332,12 +299,10 @@ def _deserialize_cell(cell_data: Dict[str, Any]) -> Any:
return cell_data['value']

@classmethod
def serialize(cls, df: pd.DataFrame, path: str) -> None:
"""Serialize a DataFrame to JSON with type information.

Args:
df: pandas DataFrame to serialize
path: file path to save the serialized data"""
def serialize(cls, df: pd.DataFrame, # DataFrame to serialize
path: str # file path to save serialized data
) -> None:
"Serialize a DataFrame to JSON with type information"
data = {
'columns': list(df.columns),
'index': list(df.index),
Expand All @@ -352,14 +317,9 @@ def serialize(cls, df: pd.DataFrame, path: str) -> None:
json.dump(data, f)

@classmethod
def deserialize(cls, path: str) -> pd.DataFrame:
"""Deserialize a DataFrame from JSON.

Args:
path: file path to load the serialized data from

Returns:
pandas DataFrame with restored data types"""
def deserialize(cls, path: str # file path to load serialized data
) -> pd.DataFrame: # DataFrame with restored data types
"Deserialize a DataFrame from JSON"
with open(path, 'r') as f:
data = json.load(f)

Expand Down
Loading
Loading