-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChapter-2: Data_Structures
115 lines (108 loc) · 6.12 KB
/
Chapter-2: Data_Structures
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
##############Chapter 2 - Data Structures############################
# Vectors ----
# A vector is a one-dimensional array whose elements all share a single mode:
# numeric, character, or logical.
a <- c(1, 2, 3, 5, 6, -1, 0, 2, 9)          # numeric data
b <- c("Happy", "Sad", "Angry", "Hungry")   # character data
# Calls such as c(...) below still reach the function c(), because R skips
# non-function objects when it looks up a function name.
c <- c(TRUE, FALSE, TRUE, TRUE, FALSE)      # logical data

# Elements are picked out with square brackets (positions start at 1).
a[9]            # the ninth element
a[5:8]          # the fifth through eighth elements
a[c(1, 2, 8)]   # the first, second and eighth elements
# Matrix ----
# A matrix is a two-dimensional array in which every element has the same
# mode (numeric, character, or logical).
# General form:
#   myMatrix <- matrix(vector, nrow = number_of_rows, ncol = number_of_columns,
#                      byrow = logical_value,
#                      dimnames = list(char_vector_rownames,
#                                      char_vector_colnames))
rname <- c("R1", "R2", "R3", "R4")
cname <- c("C1", "C2", "C3", "C4")
# A 4 x 4 matrix holds exactly 16 values, so the data vector is 1:16.
# Supplying more values than cells (e.g. 1:20) discards the surplus, and
# recent R versions warn that the data length differs from the matrix size.
myMatrix <- matrix(
  1:16,
  nrow = 4,
  ncol = 4,
  byrow = TRUE, # fill row by row rather than the default column by column
  dimnames = list(rname, cname)
)
print(myMatrix)

# Subsetting a matrix: myMatrix[rows, columns]; a blank index means "all".
myMatrix[2, ]          # the second row
myMatrix[, 3]          # the third column
myMatrix[2, 3]         # the single element in row 2, column 3
myMatrix[2, c(2, 3)]   # the 2nd and 3rd elements of row 2
myMatrix[c(2, 3), 3]   # the 2nd and 3rd elements of column 3
# Array ----
# An array is like a matrix but can have more than two dimensions.
# General form:
#   myArray <- array(vector, dimensions,
#                    dimnames = list(char_vector_dim1name,
#                                    char_vector_dim2name, ...))
dim1 <- c("A1", "A2")
dim2 <- c("B1", "B2", "B3")
dim3 <- c("C1", "C2", "C3", "C4")
# A 2 x 3 x 4 array has 24 cells, so the data vector is 1:24. A shorter
# vector (e.g. 1:20) would be recycled silently to fill the remaining cells.
myArray <- array(1:24, c(2, 3, 4), dimnames = list(dim1, dim2, dim3))
print(myArray)

# Subsetting an array works like a matrix, with one index per dimension.
myArray[, 2, ]          # everything at position 2 of the second dimension
myArray[1, 2, 3]        # a single element
myArray[1, c(2, 3), ]   # positions 2 and 3 of dimension 2, for A1, all of dim 3
# Data frames ----
# A data frame is more general than a matrix: different columns can hold
# different modes of data (numeric, character, logical, ...).
# General form:
#   myData <- data.frame(col1, name2 = col2, col3, ...)
# where col1, col2, col3, ... are column vectors of any type. A column given
# as name = vector takes that name; otherwise the variable name is used.
patientId <- c(1, 2, 3, 4)
age <- c(25, 34, 28, 54)
diabetesType <- c("Type1", "Type2", "Type1", "Type2")
status <- c("Poor", "Improved", "Excellent", "Poor")
# Resulting column names: Id, age, Diabetes, status.
patientData <- data.frame(
  Id = patientId,
  age,
  Diabetes = diabetesType,
  status
)
print(patientData)

# Referring to data in a data frame
patientData[2, 3]                      # row 2 of the third column
patientData[, 4]                       # the fourth column, as a vector
patientData[1:2]                       # the first two columns, as a data frame
patientData$age                        # one column by name
patientData[c("Diabetes", "status")]   # several columns by name
# Cross-tabulate diabetes type by age. The column is called Diabetes in the
# data frame; diabetesType is only the name of the source vector, and
# patientData$diabetesType would be NULL.
table(patientData$Diabetes, patientData$age)
str(patientData)
summary(patientData)
# Lists ----
# A list is an ordered collection of objects (components). The components can
# be vectors, matrices, data frames, and even other lists.
# General form:
#   myList <- list(name1 = object1, name2 = object2, ...)
g <- "My First List"
h <- c(25, 26, 18, 39)
j <- matrix(1:10, nrow = 5)
k <- c("one", "two", "three")
# The first two components are named; the last two are reachable by position.
mylist <- list(title = g, ages = h, j, k)
print(mylist)

# Referring to data in a list
mylist[[2]]         # the second component itself, selected by position
mylist["ages"]      # a one-element list holding the component named "ages"
mylist[c(2, 3)]     # a list made of the second and third components
# A vector inside [[ ]] indexes recursively: component first, then element.
mylist[[c(2, 3)]]   # the third element of the second component
mylist[[c(3, 3)]]   # the third element of the third component
# attach(), detach() and with() ----
# Without any of them, every column has to be qualified with its data frame.
summary(mtcars$mpg)
plot(mtcars$mpg, mtcars$disp)
plot(mtcars$mpg, mtcars$wt)

# attach() adds the data frame to the R search path, so its columns can be
# referred to by name without repeating the data frame each time.
attach(mtcars)
summary(mpg)
plot(mpg, disp)
plot(mpg, wt)
# detach() removes the data frame from the search path. Naming it explicitly
# guarantees that mtcars is what gets removed; a bare detach() removes whatever
# currently sits at position 2 of the search path.
detach(mtcars)
# The limitation of attach() shows up when an object with the same name
# already exists in the workspace: R does not stop with an error, it reports
# the masked object, and the workspace object is the one that gets used.
# An alternative approach is with(), which evaluates the braced statements
# using the columns of the given data frame.
with(mtcars, {
  # summary() describes a single object, so each column is summarised in its
  # own call. Inside braces only the last value can be auto-printed, so the
  # summaries are printed explicitly.
  print(summary(mpg))
  print(summary(disp))
  print(summary(wt))
  plot(mpg, disp)
  plot(mpg, wt)
})
# The limitation of with() is that an ordinary assignment (<-) only exists
# inside the with() call.
# A fresh name is used for the demonstration: this script already defines a
# global `a`, so testing with `a` would find that older object instead.
with(mtcars, {
  mpg_summary <- summary(mpg)
  mpg_summary # here, inside with(), the object is available
})
exists("mpg_summary") # FALSE: the object was not created in the workspace

# To create an object that survives the with() call, the special assignment
# operator <<- can be used instead of <-. Here it saves the object to the
# global environment.
# A simpler way to get the same result is to assign the value with() returns:
#   stat <- with(mtcars, summary(mpg))
with(mtcars, {
  stat <<- summary(mpg) # <<- assigns outside the with() call
})
stat # the object is available here, after with() has returned
########################### Factors ##############################
# Categorical (nominal) and ordered categorical (ordinal) variables in R are
# called factors.
# factor() stores the categorical values as a vector of integers in the range
# [1... k] (where k is the number of unique values in the variable), plus an
# internal vector of character strings (the original values) mapped to these
# integers.
diabetes <- c("Type1", "Type2", "Type1", "Type1") # plain character vector
# Stored as (1, 2, 1, 1) with 1 = Type1 and 2 = Type2; for character vectors
# the levels are created in alphabetical order by default.
diabetes <- factor(diabetes)

# For ordinal variables, add ordered = TRUE to the factor() call.
status <- c("Poor", "Improved", "Excellent", "Poor")
# With the default alphabetical levels this would be coded (3, 2, 1, 3), i.e.
# 1 = Excellent, 2 = Improved, 3 = Poor, which is the wrong order for an
# outcome scale. Giving `levels` explicitly overrides the default: here
# 1 = Poor, 2 = Improved, 3 = Excellent, so the vector is coded (1, 2, 3, 1).
# The argument name is written out in full (`ordered`) instead of relying on
# partial matching of `order`.
status <- factor(
  status,
  ordered = TRUE,
  levels = c("Poor", "Improved", "Excellent")
)
# Combine the numeric age vector with the two factors. Note the all-lowercase
# name: patientdata is a different object from the earlier patientData.
patientdata <- data.frame(age, diabetes, status)
# Check the variable types and the levels of each factor: diabetes is a
# factor and status is an ordered factor, and str() shows their integer codes.
str(patientdata)
# Minimum, maximum, mean and quartiles for the continuous variable age, and
# frequency counts for the categorical variables diabetes and status.
summary(patientdata)