You are on page 1 of 8

Nearest/Furthest Neighbors and K Means Clustering

YIK LUN, KEI


#### Build a dist object from a symmetric 7 x 7 distance matrix
dist_vals <- c(0, 10,  7, 30, 29, 38, 42,
               10,  0,  7, 23, 25, 34, 36,
                7,  7,  0, 21, 22, 31, 36,
               30, 23, 21,  0,  7, 10, 13,
               29, 25, 22,  7,  0, 11, 17,
               38, 34, 31, 10, 11,  0,  9,
               42, 36, 36, 13, 17,  9,  0)
D1 <- as.dist(matrix(dist_vals, ncol = 7))
#### Nearest-neighbour (single-linkage) hierarchical clustering
neighbors <- hclust(D1, method = "single", members = NULL)
#### Draw the dendrogram
plot(neighbors, main = "nearest neighbors")

14

8 10

Height

18

nearest neighbors

D1
hclust (*, "single")

Nearest Neighbor Clustering


#### 100 x 4 data matrix: setosa (rows 1-50) and virginica (rows 101-150)
#### of iris; columns are petal length, petal width, sepal length, sepal width
idx <- c(1:50, 101:150)
datanew <- cbind(iris$Petal.Length[idx],
                 iris$Petal.Width[idx],
                 iris$Sepal.Length[idx],
                 iris$Sepal.Width[idx])
#### Pairwise Euclidean distance matrix
D1.b <- dist(datanew)
1

#### Nearest-neighbour (single-linkage) clustering of the two-species data
nn <- hclust(D1.b, method = "single", members = NULL)
#### Cluster memberships after cutting the tree into two groups
memb <- cutree(nn, k = 2)
memb
##
##
##

[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

#### Dendrogram for the single-linkage solution, with the two-cluster
#### partition outlined in red
plot(nn, main = "Nearest neighbor of iris")
rect.hclust(nn, border = "red", k = 2)

2342
15
16
45
34
33
17
21
32
37
25
14
47
20
227
12
3
4
48
26
30
31
13
46
2
10
35
43
9
39
11
49
36
50
8
40
41
1
18
5
38
28
29
44
24
276
19 57
68
8260
70
59
85
86
69
56
7365
58
81
51
84
100
78
89
97
74
77
72
64
52
9366
87
99
63
90
75
71
94
91
95
92
96
54
67
88
55
79
8362
61
9853
76
80

1.0
0.0

Height

2.0

3.0

Nearest neighbor of iris

D1.b
hclust (*, "single")

Furthest Neighbor Clustering


#### Furthest-neighbour (complete-linkage) clustering of the same distances
fn <- hclust(D1.b, method = "complete", members = NULL)
#### Cluster memberships after cutting the tree into two groups
memb <- cutree(fn, k = 2)
memb

##
##
##

[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[71] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

#### Dendrogram for the complete-linkage solution, with the two-cluster
#### partition outlined in red
plot(fn, main = "Furthest neighbor of iris")
rect.hclust(fn, border = "red", k = 2)

42
30
31
26
10
35
13
2
46
36
5
38
28
29
41
1
18
50
8
40
723
43
3
4
48
14
9
39
17
33
34
15
16
6
19
21
32
37
11
49
45
47
20
22
44
24
27
12
25 57
65
72
64
52
9362
97
100
78
89
84
74
7770
85
91
95
75
71
9451
87
99
66
61
98
63
90
92
9659
54
67
88
55
79
83
58
81
53
76
80
69
56
73
68
8260
86

2
0

Height

Furthest neighbor of iris

D1.b
hclust (*, "complete")

K Means Clustering
#### K-means clustering with k = 2. nstart = 10 re-runs the algorithm from
#### ten random initialisations and keeps the best, guarding against the
#### poor local optimum a single start (kmeans' default) can land in.
iris.kmeans <- kmeans(datanew, centers = 2, nstart = 10)
cluster <- iris.kmeans$cluster
#### True species of the selected rows. factor() on a factor drops the
#### unused "versicolor" level, so only two labels are needed here.
flowers <- factor(iris[c(1:50, 101:150), 5], labels = c("setosa", "virginica"))
#### Cross-tabulate cluster labels against species to spot misclassification
table(cluster, flowers)
##
flowers
## cluster setosa virginica
##
1
50
0
##
2
0
50

#### Scatter plot of the first two columns of datanew, coloured by k-means
#### cluster, with point shape encoding the true species.
#### FIX: columns 1-2 of datanew are Petal.Length and Petal.Width (see the
#### matrix construction), so the original "sepal" axis labels were wrong.
plot(datanew[, 1], datanew[, 2],
     col = c("red", "blue")[cluster],
     pch = c(23, 24)[unclass(flowers)],
     main = "K-means of iris data containing only two species",
     xlab = "petal length", ylab = "petal width")
#### Shape legend: true species
legend("topleft", c("setosa", "virginica"), pch = c(23, 24))
#### Letter/colour legend: k-means cluster label
legend("bottomright", c("cluster 1", "cluster 2"),
       pch = c("R", "B"), col = c("red", "blue"))

2.5

K-means of iris data containing only two species

1.5
1.0
0.5

sepal width

2.0

setosa
virginica

R cluster 1
B cluster 2
1

sepal length

Nearest Neighbor Clustering


#### 150 x 4 data matrix: all three iris species; columns are petal
#### length, petal width, sepal length, sepal width
datanew1 <- cbind(iris$Petal.Length, iris$Petal.Width,
                  iris$Sepal.Length, iris$Sepal.Width)
#### Pairwise Euclidean distances
D1.b1 <- dist(datanew1)
#### Single-linkage (nearest-neighbour) clustering and its dendrogram,
#### with the two-cluster partition outlined in red
nn.c1 <- hclust(D1.b1, method = "single", members = NULL)
plot(nn.c1, main = "nearest neighbors for iris data with 3 species")
rect.hclust(nn.c1, border = "red", k = 2)

0.5

23 42
15
16
45
34
33
17
21
32
37
25
14
47
20
22
7
12
3
4
48
26
30
31
13
46
2
10
3543
9
39
11
49
50
836
40
41
15
18
38
28
29
44
24
27
6
19
118
132
107
99
61
58
94 109
110
135
136
119
106
123
69
88115
63
108
131
165
20
101
60
86
7480
79
64
92
7062
81
82
54
90
89
9591
100
96
97
68
83
93
67
8556
72
77
78
87
51
53
75
98
55
66
7659
52
57
150
71
128
139
147
124
127
122
102
143 114
73
84
134
116
137
149
113
140
125
121
144
141
145
142
146
104
117
138
105
129
133
112
111
148
103
126
130

0.0

Height
1.0

1.5

nearest neighbors for iris data with 3 species

D1.b1
hclust (*, "single")

#### Three-cluster memberships from the single-linkage tree
clusters.nn.c1 <- cutree(nn.c1, k = 3)

Furthest Neighbor Clustering

#### Furthest-neighbour (complete-linkage) clustering of the full iris
#### distances, with the two-cluster partition outlined in red
fn.d1 <- hclust(D1.b1, method = "complete", members = NULL)
plot(fn.d1, main = "Furthest neighbors for iris data with 3 species")
rect.hclust(fn.d1, border = "red", k = 2)

108
131
103
126
130
119
106
123
118
132
110
136
141
145
125
121
144
101
137
149
116
111
148
113
140
142
146
109
104
117
138
105
129
133
150
71
128
139
115
122
114
102
143
135
112
147
124
127
73
84
134
120
69
88
66
76
77
55
59
78
87
51
53
86
52
57
75
98
74
79
64
92
61
99
58
94
107
67
85
56
91
62
72
68
83
93
95
100
89
96
97
63
65
80
60
54
90
70
81
82
42
30
31
26
10
35
13
26
46
3
5
38
28
29
41
1
18
50
8 23
40
7
43
43
48
14
9
39
17
33
34
15
16
6
19
21
32
37
11
49
45
47
20
22
44
24
27
12
25

2
0

Height

Furthest neighbors for iris data with 3 species

D1.b1
hclust (*, "complete")
#### Three-cluster memberships from the complete-linkage tree
clusters.fn.d1 <- cutree(fn.d1, k = 3)

K Means Clustering
## K-means clustering with k = 3. nstart = 10 re-runs from ten random
## initialisations and keeps the best fit (default is a single start,
## which can land in a poor local optimum).
iris.kmeans.e1 <- kmeans(datanew1, centers = 3, nstart = 10)
cluster.e1 <- iris.kmeans.e1$cluster
## True species labels for all 150 observations
flowers.e1 <- factor(iris[, 5], labels = c("setosa", "versicolor", "virginica"))
## Scatter plot of the first two columns, coloured by cluster, shaped by
## true species.
## FIX: columns 1-2 of datanew1 hold Petal.Length/Petal.Width (see its
## construction), so the original "sepal" axis labels were wrong.
plot(datanew1[, 1], datanew1[, 2],
     col = c("red", "green", "blue")[cluster.e1],
     pch = c(23, 21, 24)[unclass(flowers.e1)],
     main = "K-means of iris data containing three species",
     xlab = "petal length", ylab = "petal width")
## Shape legend: true species
legend("topleft", c("setosa", "versicolor", "virginica"),
       pch = c(23, 21, 24))
## Letter/colour legend: k-means cluster label
legend("bottomright", c("cluster 1", "cluster 2", "cluster 3"),
       pch = c("R", "G", "B"), col = c("red", "green", "blue"))

2.5

K-means of iris data containing three species

1.5
1.0

R cluster 1
G cluster 2
B cluster 3

0.5

sepal width

2.0

setosa
versicolor
virginica

sepal length

library(class)
#### Min-max scale each feature to [0, 1] so no variable dominates the
#### Euclidean distance used by knn.
normalize <- function(x) (x - min(x)) / (max(x) - min(x))
iris_n <- as.data.frame(lapply(iris[, 1:4], normalize))
#### FIX: the split used sample() with no seed, making the train/test
#### partition (and hence the confusion matrix below) irreproducible.
set.seed(1)
rows <- sample(150)
train <- iris_n[rows[1:130], ]
test <- iris_n[rows[131:150], ]
train_target <- iris[rows[1:130], 5]
test_target <- iris[rows[131:150], 5]
#### Rule of thumb: choose k near sqrt(n); sqrt(150) ~ 12.2, so k = 13
#### (odd, to reduce ties in the majority vote).
sqrt(nrow(iris))
## [1] 12.24745
m1 <- knn(train = train, test = test, cl = train_target, k = 13)

Compare Clustering
## Figure out if there is any misclassification
## Extract the flower names of data
## Confusion table: rows are k-means cluster labels (1-3), columns are
## the true species counts for the full three-species data.
table(cluster.e1, flowers.e1)
##
flowers.e1
## cluster.e1 setosa versicolor virginica
7

##
##
##

1
2
3

50
0
0

0
48
2

0
14
36

## Confusion table: single-linkage three-cluster memberships vs species.
table(clusters.nn.c1,flowers.e1)
##
flowers.e1
## clusters.nn.c1 setosa versicolor virginica
##
1
50
0
0
##
2
0
50
48
##
3
0
0
2
## Confusion table: complete-linkage three-cluster memberships vs species.
table(clusters.fn.d1, flowers.e1)
##
flowers.e1
## clusters.fn.d1 setosa versicolor virginica
##
1
50
0
0
##
2
0
23
49
##
3
0
27
1
## Confusion matrix: true labels of the 20 test rows vs 13-NN predictions.
table(test_target,m1)

##
m1
## test_target setosa versicolor virginica
##
setosa
8
0
0
##
versicolor
0
7
0
##
virginica
0
0
5

You might also like