# Wednesday3.R
# Date: Wednesday, August 8, 2012
###################
### Data frames ###
###################
# Data sets are usually supplied as data matrices:
# plain text files in which
# - each row corresponds to an observation
# - each column corresponds to a "variable"
#
# Example: StatWiSo2003.txt
ds <- read.table(file = "StatWiSo2003.txt")
str(ds)
# Something's wrong: the first line was interpreted
# as an observation!
ds <- read.table(file = "StatWiSo2003.txt", header = TRUE)
# "header = TRUE" means: the first line contains
# the names of the variables.
ds <- read.table(
  file = "StatWiSo2003.txt", header = TRUE, sep = "\t",
  stringsAsFactors = TRUE
)
# "sep = "\t"" means: columns are separated by tab characters.
# "stringsAsFactors = TRUE" means: text columns are read as factors.
# This was the default before R 4.0.0; newer versions keep text columns
# as character vectors unless the conversion is requested explicitly.
# To see what ds contains:
ds # Well, too many numbers :-(
str(ds) # str(ucture) of ds
# "data.frame" is a special type of data structure:
# a list whose elements are all one-dimensional
# arrays of the same length.
# "factor": a categorical variable
# "int", "num": a numerical variable
# To convert a variable of type "int" into a categorical
# variable, use factor():
ds$ZufZiffer <- factor(ds$ZufZiffer)
# Listing the levels explicitly keeps categories in the factor
# even if they never occur in the sample:
ds$ZufZiffer <- factor(ds$ZufZiffer, levels = 0:9)
# Months are categorical as well:
ds$GebMonat <- factor(ds$GebMonat, levels = 1:12)
# Analysing single factors and two factors simultaneously:
# Compute absolute frequencies:
table(ds$Herkunft)
table(ds$Rauchen)
table(ds$Geschlecht)
# Contingency table for two factors:
table(ds$Rauchen, ds$Geschlecht)
# Pie charts:
pie(table(ds$Rauchen))
# The frequencies of ZufZiffer are plotted several times below,
# so compute the table only once:
digit_counts <- table(ds$ZufZiffer)
pie(digit_counts)
# The following plots change the graphics parameters. Remember the
# current settings so that they can be restored afterwards; otherwise
# every later plot would inherit the two-row layout and the narrow margins.
old_par <- par(no.readonly = TRUE)
# Pie chart and bar plot in one window:
par(mfrow = c(2, 1))
pie(digit_counts)
barplot(digit_counts)
# Hmm, the sizes are a little strange!
# Try to fix this with parameter "mai"
# (margins in inches: bottom, left, top, right):
par(mfrow = c(2, 1), mai = c(0.5, 0.4, 0.2, 0))
pie(digit_counts)
barplot(digit_counts)
# Hmm, somewhat better, but...
?pie
# Try to use parameter "radius" of pie:
par(mfrow = c(2, 1), mai = c(0.5, 0.4, 0.2, 0))
pie(digit_counts, radius = 1)
barplot(digit_counts)
# Yesss!
# If we want to set and match colors: use one color per category.
# A shorter color vector would be recycled, so that several
# categories end up with the same color.
col <- rainbow(length(digit_counts))
par(mfrow = c(2, 1), mai = c(0.5, 0.4, 0.2, 0))
pie(digit_counts, radius = 1, col = col)
barplot(digit_counts, col = col)
# Back to the graphics settings that were active before this section:
par(old_par)
# A data frame can be indexed like a matrix.
# Picking out particular components:
ds[3, 4]        # observation no. 3, variable no. 4
ds$Herkunft[3]  # third entry of the variable Herkunft
ds[3, ]         # every variable for observation no. 3
ds$Alter        # a single variable, selected by name
# One or several columns of the data frame:
ds[, 2:4]
ds[, c(1, 2, 4, 9)]
# Comparisons are carried out entry by entry:
rent <- ds$MonMiete
rent > 0
# The result is a logical vector with entries TRUE, FALSE or NA.
# How many students in the sample pay rent?
sum(rent > 0)
# Any NA entry turns the whole sum into NA, but
sum(rent > 0, na.rm = TRUE)
# counts the entries that are TRUE and skips the NAs
# (TRUE counts as 1, FALSE as 0).
# How many students in the sample
# do not have to pay any rent?
sum(rent == 0, na.rm = TRUE)
# Equality is tested with "==", not with "="!
# The command "ds$MonMiete = 0" would overwrite
# every entry of ds$MonMiete with zero!
# Number of missing values:
sum(is.na(rent))
# The same information in a single table:
table(rent > 0)
# Some stuff with boxplots:
### Box-(and-whiskers) plots
# Careful: a boxplot shows even less detail than a histogram!
# The box marks the quartiles:
# - lower edge: the 25% quantile (1st quartile)
# - middle (bold) line: the 50% quantile (median, 2nd quartile)
# - upper edge: the 75% quantile (3rd quartile)
# The whiskers:
# - upper end: largest observation which is <= 3rd quartile + 1.5 * interquartile range
# - lower end: analogously, smallest observation which is >= 1st quartile - 1.5 * interquartile range
# The dots:
# - observations beyond the whiskers ("very large/small observations")
# Note: the expressions in the formulas below also appear as axis labels.
boxplot(ds$Alter ~ ds[["Geschlecht"]])
boxplot(ds[["Alter"]] ~ ds[["Rauchen"]])
# Wilcoxon test to see whether the difference between the groups is significant
wilcox.test(ds[["Alter"]] ~ ds[["Geschlecht"]])
# NOTE(review): the formula interface of wilcox.test() needs a grouping
# variable with exactly two levels. Presumably Rauchen has more than two
# categories, so the next call stops with an error - confirm against the data.
wilcox.test(ds[["Alter"]] ~ ds[["Rauchen"]])
# One way to solve this problem:
# Replace the group variable with a binary one.
# pmin(x, 1) caps every value at 1, so all values >= 1 fall into one group.
# NOTE(review): this only produces two groups if Rauchen is coded numerically
# with some values below 1 (e.g. 0 = non-smoker). If Rauchen is a factor,
# as.numeric() returns the level codes (all >= 1) and every observation would
# end up in the same group - TODO confirm the coding of Rauchen.
boxplot(ds[["Alter"]] ~ pmin(as.numeric(ds[["Rauchen"]]), 1))
wilcox.test(ds[["Alter"]] ~ pmin(as.numeric(ds[["Rauchen"]]), 1))