Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework of train_model function #65

Closed
wants to merge 8 commits into from
Closed
28 changes: 14 additions & 14 deletions bin/glycoworkGUI.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,13 @@ def leave(event):
self.tooltip.destroy()
widget.bind('<Enter>', enter)
widget.bind('<Leave>', leave)

def browse_csv(self):
file_path = filedialog.askopenfilename(filetypes = [("CSV Files", "*.csv"), ("Excel Files", "*.xlsx")])
if file_path:
self.csv_entry.delete(0, tk.END)
self.csv_entry.insert(0, file_path)

def browse_folder(self):
folder_path = filedialog.askdirectory()
if folder_path:
Expand All @@ -154,7 +154,7 @@ def openGlycoDrawExcelDialog():
class DifferentialExpressionDialog(simpledialog.Dialog):
def body(self, master):
self.title("Differential Expression Input")

# CSV file selection
tk.Label(master, text="CSV or Excel File:").grid(row = 0, sticky = tk.W)
self.csv_file_var = tk.StringVar(master)
Expand All @@ -176,23 +176,23 @@ def body(self, master):
self.output_folder_entry.grid(row = 4, column = 1)
self.output_folder_browse = tk.Button(master, text = "Browse...", command = self.browse_output_folder)
self.output_folder_browse.grid(row = 4, column = 2)

# Treatment group indices
tk.Label(master, text = "Treatment Group Columns:").grid(row = 1, sticky = tk.W)
self.treatment_entry = tk.Entry(master)
self.treatment_entry.grid(row = 1, column = 1, columnspan = 2, sticky = tk.W+tk.E)

# Control group indices
tk.Label(master, text = "Control Group Columns:").grid(row = 2, sticky = tk.W)
self.control_entry = tk.Entry(master)
self.control_entry.grid(row = 2, column = 1, columnspan = 2, sticky = tk.W+tk.E)

# Motifs option
tk.Label(master, text="Motif-based analysis:").grid(row = 3, sticky = tk.W)
self.motifs_var = tk.BooleanVar(master)
self.motifs_check = tk.Checkbutton(master, variable = self.motifs_var)
self.motifs_check.grid(row = 3, column = 1, sticky = tk.W)

return self.csv_entry # to put focus on the csv file entry widget

def create_tooltip(self, widget, text):
Expand Down Expand Up @@ -262,7 +262,7 @@ def openDifferentialExpressionDialog():
class GetHeatmapDialog(simpledialog.Dialog):
def body(self, master):
self.title("Get Heatmap Input")

# Input file selection
tk.Label(master, text = "Select Input CSV or Excel File:").grid(row = 0, sticky = tk.W)
self.input_file_entry = tk.StringVar(master)
Expand All @@ -276,7 +276,7 @@ def body(self, master):
"Ideally, rows are samples and columns are glycans (but the function can deal with the opposite)\n"
"Glycans should be ideally in IUPAC-condensed\n"
"If you do NOT analyze motifs, the glycan format does not matter at all")

# Motif analysis option
self.motif_analysis_var = tk.BooleanVar()
self.motif_analysis_check = tk.Checkbutton(master, text = "Motif Analysis", variable = self.motif_analysis_var)
Expand All @@ -291,7 +291,7 @@ def body(self, master):
self.show_all_var = tk.BooleanVar()
self.show_all_check = tk.Checkbutton(master, text = "Show all?", variable = self.show_all_var)
self.show_all_check.grid(row = 1, column = 2, sticky = tk.W)

# Output PDF file selection
tk.Label(master, text = "Select Output for Heatmap File:").grid(row = 2, sticky = tk.W)
self.output_file_entry = tk.StringVar(master)
Expand Down Expand Up @@ -360,7 +360,7 @@ def openGetHeatmapDialog():
class LectinArrayAnalysisDialog(simpledialog.Dialog):
def body(self, master):
self.title("Lectin Array Analysis Input")

# CSV or Excel file selection
tk.Label(master, text="Select CSV or Excel File:").grid(row = 0, sticky = tk.W)
self.file_entry = tk.Entry(master)
Expand All @@ -372,17 +372,17 @@ def body(self, master):
self.create_tooltip(self.help_icon, "CSV Format Help:\n\n"
"Format data as samples as rows and lectins as columns (first column = sample names)\n"
"Have lectin names in the column names")

# Treatment group indices
tk.Label(master, text = "Treatment Group Rows (comma-separated):").grid(row = 1, sticky = tk.W)
self.treatment_entry = tk.Entry(master)
self.treatment_entry.grid(row = 1, column = 1, columnspan = 2, sticky = tk.W+tk.E)

# Control group indices
tk.Label(master, text = "Control Group Rows (comma-separated):").grid(row = 2, sticky = tk.W)
self.control_entry = tk.Entry(master)
self.control_entry.grid(row = 2, column = 1, columnspan = 2, sticky = tk.W+tk.E)

# Paired analysis option
tk.Label(master, text = "Paired Analysis:").grid(row = 3, sticky = tk.W)
self.paired_var = tk.BooleanVar()
Expand Down
24 changes: 10 additions & 14 deletions glycowork/glycan_data/data_entry.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
import pandas as pd
from typing import Optional
from glycowork.motif.processing import check_nomenclature
from glycowork.motif.graph import glycan_to_nxGraph, compare_glycans


def check_presence(glycan, df, colname = 'glycan',
name = None, rank = 'Species', fast = False):
"""checks whether glycan (of that species) is already present in dataset\n
| Arguments:
| :-
| glycan (string): IUPAC-condensed glycan sequence
| df (dataframe): glycan dataframe where glycans are under colname and ideally taxonomic labels are columns
| name (string): name of the species (etc.) of interest
| rank (string): column name for filtering; default: species
| fast (bool): True uses precomputed glycan graphs, only use if df has column 'graph' with glycan graphs\n
| Returns:
| :-
| Returns text output regarding whether the glycan is already in df
"""
def check_presence(glycan: str, # IUPAC-condensed glycan sequence
df: pd.DataFrame, # glycan dataframe where glycans are under colname
colname: str = 'glycan', # column name containing glycans
name: Optional[str] = None, # name of species of interest
rank: str = 'Species', # column name for filtering
fast: bool = False # True uses precomputed glycan graphs
) -> None:
"checks whether glycan (of that species) is already present in dataset"
if any([p in glycan for p in ['RES', '=']]) or not isinstance(glycan, str):
check_nomenclature(glycan)
return
Expand Down
150 changes: 55 additions & 95 deletions glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from os import path
from itertools import chain
from importlib import resources
from typing import Any, Dict, Union
from typing import Any, Dict, List, Optional

with resources.open_text("glycowork.glycan_data", "glycan_motifs.csv") as f:
motif_list = pd.read_csv(f)
Expand Down Expand Up @@ -85,30 +85,30 @@ def __dir__(self):
'OS': {'GlcNAc', 'Gal', 'GalNAc'}}


def unwrap(nested_list):
"""converts a nested list into a flat list"""
def unwrap(nested_list: List[Any] # list to be flattened
          ) -> List[Any]: # flattened list
  "converts a nested list into a flat list"
  # chain.from_iterable iterates the sublists lazily instead of unpacking the
  # whole outer list into the call's argument tuple, so very long inputs work too
  return list(chain.from_iterable(nested_list))


def find_nth(haystack, needle, n):
"""finds n-th instance of motif\n
| Arguments:
| :-
| haystack (string): string to search for motif
| needle (string): motif
| n (int): n-th occurrence in string (not zero-indexed)\n
| Returns:
| :-
| Returns starting index of n-th occurrence in string
"""
def find_nth(haystack: str, # string to search for motif
             needle: str, # motif
             n: int # n-th occurrence in string (not zero-indexed)
             ) -> int: # starting index of n-th occurrence; -1 if absent
  "finds n-th instance of motif"
  # locate the first hit, then skip forward (n-1) more non-overlapping hits
  pos = haystack.find(needle)
  remaining = n - 1
  while remaining > 0 and pos >= 0:
    pos = haystack.find(needle, pos + len(needle))
    remaining -= 1
  return pos


def find_nth_reverse(string, substring, n, ignore_branches = False):
def find_nth_reverse(string: str, # string to search
substring: str, # substring to find
n: int, # n-th occurrence from end
ignore_branches: bool = False # whether to ignore branches when counting
) -> int: # position of n-th occurrence from end
"finds n-th instance of motif from end of string"
# Reverse the string and the substring
reversed_string = string[::-1]
reversed_substring = substring[::-1]
Expand Down Expand Up @@ -142,15 +142,9 @@ def find_nth_reverse(string, substring, n, ignore_branches = False):
return original_start_index


def remove_unmatched_brackets(s):
"""Removes all unmatched brackets from the string s.\n
| Arguments:
| :-
| s (string): glycan string in IUPAC-condensed\n
| Returns:
| :-
| Returns glycan without unmatched brackets
"""
def remove_unmatched_brackets(s: str # glycan string in IUPAC-condensed
) -> str: # glycan without unmatched brackets
"Removes all unmatched brackets from the string s"
while True:
# Keep track of the indexes of the brackets
stack = []
Expand All @@ -173,48 +167,30 @@ def remove_unmatched_brackets(s):
return s


def reindex(df_new, df_old, out_col, ind_col, inp_col):
"""Returns columns values in order of new dataframe rows\n
| Arguments:
| :-
| df_new (pandas dataframe): dataframe with the new row order
| df_old (pandas dataframe): dataframe with the old row order
| out_col (string): column name of column in df_old that you want to reindex
| ind_col (string): column name of column in df_old that will give the index
| inp_col (string): column name of column in df_new that indicates the new order; ind_col and inp_col should match\n
| Returns:
| :-
| Returns out_col from df_old in the same order of inp_col in df_new
"""
def reindex(df_new: pd.DataFrame, # dataframe with new row order
            df_old: pd.DataFrame, # dataframe with old row order
            out_col: str, # column name in df_old to reindex
            ind_col: str, # column name in df_old for index
            inp_col: str # column name in df_new for new order
            ) -> list: # out_col from df_old reordered to match inp_col in df_new
  "Returns columns values in order of new dataframe rows"
  if ind_col != inp_col:
    print("Mismatching column names for ind_col and inp_col. Doesn't mean it's wrong but pay attention.")
  # build the key -> value lookup once (O(n)) instead of an O(n) list.index()
  # scan per row; keep the FIRST occurrence of duplicate keys, like list.index did
  mapping = {}
  for key, val in zip(df_old[ind_col], df_old[out_col]):
    if key not in mapping:
      mapping[key] = val
  try:
    return [mapping[k] for k in df_new[inp_col]]
  except KeyError as e:
    # preserve the ValueError that list.index raised for missing keys
    raise ValueError(f"{e.args[0]!r} is not in df_old[{ind_col!r}]") from e


def stringify_dict(dicty):
"""Converts dictionary into a string\n
| Arguments:
| :-
| dicty (dictionary): dictionary\n
| Returns:
| :-
| Returns string of type key:value for sorted items
"""
def stringify_dict(dicty: Dict[Any, Any] # dictionary to convert
                   ) -> str: # string of type key:value for sorted items
  "Converts dictionary into a string"
  # concatenate key/value pairs in sorted key order
  pieces = [f"{k}{v}" for k, v in sorted(dicty.items())]
  return ''.join(pieces)


def replace_every_second(string, old_char, new_char):
"""function to replace every second occurrence of old_char in string with new_char\n
| Arguments:
| :-
| string (string): a string
| old_char (string): a string character to be replaced (every second occurrence)
| new_char (string): the string character to replace old_char with\n
| Returns:
| :-
| Returns string with replaced characters
"""
def replace_every_second(string: str, # input string
old_char: str, # character to replace
new_char: str # character to replace with
) -> str: # modified string
"function to replace every second occurrence of old_char in string with new_char"
count = 0
result = []
for char in string:
Expand All @@ -226,36 +202,25 @@ def replace_every_second(string, old_char, new_char):
return ''.join(result)


def multireplace(string, remove_dic):
"""Replaces all occurences of items in a set with a given string\n
| Arguments:
| :-
| string (str): string to perform replacements on
| remove_dic (set): dict of form to_replace:replace_with\n
| Returns:
| :-
| (str) modified string
"""
def multireplace(string: str, # string to perform replacements on
                 remove_dic: Dict[str, str] # dict of form to_replace:replace_with
                 ) -> str: # modified string
  "Replaces all occurrences of each dict key in string with its corresponding value"
  # replacements run sequentially in dict order, so an earlier replacement
  # can affect what a later one matches
  for k, v in remove_dic.items():
    string = string.replace(k, v)
  return string


def strip_suffixes(columns):
"""Strip numerical suffixes like .1, .2, etc., from column names."""
def strip_suffixes(columns: List[Any] # column names
                   ) -> List[str]: # column names without numerical suffixes
  "Strip numerical suffixes like .1, .2, etc., from column names"
  # compile once, then strip a trailing ".<digits>" from each (stringified) name
  suffix_pattern = re.compile(r"\.\d+$")
  return [suffix_pattern.sub("", str(col)) for col in columns]


def build_custom_df(df, kind = 'df_species'):
"""creates custom df from df_glycan\n
| Arguments:
| :-
| df (dataframe): df_glycan / sugarbase
| kind (string): whether to create 'df_species', 'df_tissue', or 'df_disease' from df_glycan; default:df_species\n
| Returns:
| :-
| Returns custom df in the form of one glycan - species/tissue/disease association per row
"""
def build_custom_df(df: pd.DataFrame, # df_glycan / sugarbase
kind: str = 'df_species' # whether to create 'df_species', 'df_tissue', or 'df_disease'
) -> pd.DataFrame: # custom df with one glycan - species/tissue/disease association per row
"creates custom df from df_glycan"
kind_to_cols = {
'df_species': ['glycan', 'Species', 'Genus', 'Family', 'Order', 'Class',
'Phylum', 'Kingdom', 'Domain', 'ref'],
Expand All @@ -274,8 +239,10 @@ def build_custom_df(df, kind = 'df_species'):
return df


def download_model(file_id, local_path = 'model_weights.pt'):
"""Download the model weights file from Google Drive."""
def download_model(file_id: str, # Google Drive share URL or bare file ID
                   local_path: str = 'model_weights.pt' # where to save model file
                   ) -> None:
  "Download the model weights file from Google Drive"
  # accept both a full share link (.../d/<id>/view...) and a bare file ID;
  # the original unconditional split raised IndexError on bare IDs
  if '/d/' in file_id:
    file_id = file_id.split('/d/')[1].split('/view')[0]
  url = f'https://drive.google.com/uc?id={file_id}'
  gdown.download(url, local_path, quiet = False)
Expand Down Expand Up @@ -332,12 +299,10 @@ def _deserialize_cell(cell_data: Dict[str, Any]) -> Any:
return cell_data['value']

@classmethod
def serialize(cls, df: pd.DataFrame, path: str) -> None:
"""Serialize a DataFrame to JSON with type information.

Args:
df: pandas DataFrame to serialize
path: file path to save the serialized data"""
def serialize(cls, df: pd.DataFrame, # DataFrame to serialize
path: str # file path to save serialized data
) -> None:
"Serialize a DataFrame to JSON with type information"
data = {
'columns': list(df.columns),
'index': list(df.index),
Expand All @@ -352,14 +317,9 @@ def serialize(cls, df: pd.DataFrame, path: str) -> None:
json.dump(data, f)

@classmethod
def deserialize(cls, path: str) -> pd.DataFrame:
"""Deserialize a DataFrame from JSON.

Args:
path: file path to load the serialized data from

Returns:
pandas DataFrame with restored data types"""
def deserialize(cls, path: str # file path to load serialized data
) -> pd.DataFrame: # DataFrame with restored data types
"Deserialize a DataFrame from JSON"
with open(path, 'r') as f:
data = json.load(f)

Expand Down
Loading
Loading