-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions_3.R
207 lines (158 loc) · 7.77 KB
/
functions_3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
# functions for lipid heatmap #3
# This section of code only describes the Lipid Heatmap functions in MetaboLink, and do not work independently from all the other code done in MetaboLink.
# Meaning if testing of Lipid Heatmap is wanted, go to the test server linked in the B.Sc. ‘Methods and Materials’ section.
# Lipid Heatmap
# Grouping the data
process_data <- function(sequence, data) {
# Initialize an empty list to store the sample names by group and other information
results <- list()
sample_identifiers <- rownames(sequence)[sequence[, "labels"] == "Sample"]
groups <- sequence[sample_identifiers, "class"]
names <- sample_identifiers
# Extract the 'Sample' labels and corresponding 'class' from 'sequence'
sample_rows <- sequence[sequence$labels == "Sample", ]
results$sample_rows <- sample_rows # Store the sample rows in the results list
unique_groups <- unique(sample_rows$class)
results$unique_groups <- unique_groups # Store the unique groups in the results list
# Initialize an empty list within results for grouped_samples
results$grouped_samples <- list()
# Iterate over each group to get the corresponding sample names
for (group in unique_groups) {
if (!is.na(group)) {
# Get the sample names for the current group
samples_in_group <- sample_rows[sample_rows$class == group, 1]
# Add the sample names to the list, named by their group
results$grouped_samples[[paste("group", group)]] <- samples_in_group
results[[paste("Group", group)]] <- samples_in_group # Store each group separately in the results list
}
}
# Check if any of the components are NULL or empty and handle accordingly
if (length(results$grouped_samples) == 0) {
results$grouped_samples <- list(group1 = NA) # Placeholder if no groups found
}
# Return the list of results that now includes grouped samples, unique groups, sample rows, and each group
return(results)
}
# Function to create grouped data frames based on sequence and data
create_grouped_data_frames <- function(sequence, data) {
# Initialize an empty list to store data frames for each group
grouped_data_frames <- list()
# Extract the 'Sample' labels and corresponding 'class' from 'sequence'
sample_rows <- sequence[sequence$labels == "Sample", ]
unique_groups <- unique(sample_rows$class)
# Iterate over each unique group to create data frames
for (group in unique_groups) {
if (!is.na(group)) {
# Get the sample identifiers for the current group
sample_identifiers <- rownames(sample_rows)[sample_rows$class == group]
# Find the matching column indices, excluding NA values
matching_indices <- match(sample_identifiers, colnames(data))
matching_indices <- matching_indices[!is.na(matching_indices)]
# Check if we have any matching columns at all
if (length(matching_indices) > 0) {
# Select only the columns for the current group
group_data <- data[, matching_indices, drop = FALSE]
# Store the filtered data frame in the list, named by the group
grouped_data_frames[[paste("group", group)]] <- group_data
} else {
warning(paste("Group", group, "contains column names that are not in the data. Skipping this group."))
}
}
}
return(grouped_data_frames)
}
calculate_means_for_grouped_data <- function(grouped_data_frames) {
# Initialize a new list to store the modified data frames
new_grouped_data_frames <- list()
# Iterate over each group's data frame in the list
for (group_name in names(grouped_data_frames)) {
# Clone the current group's data frame to avoid modifying the original
group_data <- grouped_data_frames[[group_name]]
# Assuming the first column is not numeric and should be excluded from the mean calculation
# Calculate the mean for each row across all other columns
means <- rowMeans(group_data[, drop = FALSE], na.rm = TRUE)
# Append the calculated means as a new column to the cloned data frame
group_data$Mean <- means
# Add the modified data frame to the new list
new_grouped_data_frames[[group_name]] <- group_data
}
# Return the new list of grouped data frames with means calculated
return(new_grouped_data_frames)
}
# Function to group lipids by their class prefix (e.g., "CAR", "LP", etc.)
group_lipids_by_class <- function(data) {
# Assuming the first column of 'data' contains the lipid names like "CAR(18:1)"
lipid_names <- data[[1]] # Replace 1 with the actual column name or index if different
# Use a regular expression to extract the class prefix from lipid names
# This matches any consecutive alphabetic characters at the beginning of the string
lipid_classes <- sub("\\(([0-9]+:[0-9]+)\\).*", "", lipid_names)
# Create a data frame that maps lipid names to their class
class_mapping <- data.frame(Lipid_Name = lipid_names, Class = lipid_classes, stringsAsFactors = FALSE)
# Optionally, if you want to return a list that names each group by its class
# names(grouped_data) <- unique(lipid_classes)
return(class_mapping)
}
# Data cleaning
# Function to extract patterns from compound names
# Removes all noise from compound name, so name and length is the only left: eg. going from "CAR 14:1'CAR'[M+H]+" to "CAR 14:1"
extract_pattern <- function(name) {
# Pattern to find first part consisting of letters and numbers with a colon or a letter before the numbers
pattern <- "([A-Za-z]+\\s[0-9]+:[0-9]+)|([A-Za-z]+\\s[[:alpha:]]?-?[0-9]+:[0-9]+)"
matches <- regmatches(name, gregexpr(pattern, name))
# Returns the first match, or the hole name if no match
if (length(matches[[1]]) > 0) {
return(matches[[1]][1])
} else {
return(name)
}
}
# Function to format strings
# Puts the length and double bonds numbers into a (). Eg "CAR 14:1" to "CAR(14:1)"
format_strings <- function(input_strings) {
# Use gsub with regular expression to remove all whitespace characters
formatted_strings <- gsub("\\s+", "", input_strings)
# Add parentheses around the numbers
formatted_strings <- gsub("([A-Za-z]*)(\\d+):(\\d+)", "\\1(\\2:\\3)", formatted_strings)
return(formatted_strings)
}
### pattern_column is that corret?
# Function to filter rows based on the specified pattern, meaning removes any data that are not on X(C:D) format.
filter_data_by_pattern <- function(data) {
# Define the regular expression pattern
pattern <- "^.+\\(\\d+:\\d+\\)$"
# Check if the first column of data matches the pattern
filtered_data <- data[grepl(pattern, data[[1]]), ]
return(filtered_data)
}
#merge duplicated names of the data
merge_duplicates <- function(data) {
# Ensure the first column is treated as the Compound Name
compound_name_col <- names(data)[1]
# Group by the first column and then summarise all other columns by summing
data_merged <- data %>%
group_by(.data[[compound_name_col]]) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
return(data_merged)
}
#duplicated names have add _1, _2 and _3 depending on how many duplicates.
unique_compound_names <- function(data) {
# Ensure that 'data' is a data frame and has at least one column
if (!is.data.frame(data) || ncol(data) < 1) {
stop("The input must be a data frame with at least one column.")
}
# Apply the processing to the first column of 'data'
data[[1]] <- ave(data[[1]], data[[1]], FUN = function(x) {
if (length(x) > 1) {
# Extract the base name without parentheses
base_name <- sub("\\(.*\\)", "", x)
# Extract the part within parentheses
suffix <- sub(".*\\(", "(", x)
# Combine base name with sequence number and the part within parentheses
paste0(base_name, "_", seq_along(x), suffix)
} else {
x
}
})
# Return the modified data
return(data)
}