-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathk_means.cpp
More file actions
165 lines (127 loc) · 4.87 KB
/
k_means.cpp
File metadata and controls
165 lines (127 loc) · 4.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#include "k_means.h"
float euclidDistance(int dim, //no. dimensions
float *v1, //[numdims]
float *v2) //[numdims]
{
int i;
float dist = 0.0;
for (i=0; i < dim; ++i)
{
dist += (v1[i]-v2[i]) * (v1[i]-v2[i]);
}
return dist;
}
int getNearestClusterIndex(int numClusters, //no. clusters
int numDims, //no. coordinates
float *vector, //[numDims]
float **clusters) //[numClusters][numDims]
{
int index, i;
float distance, minDistance;
//init first cluster as closest to vector
index = 0;
minDistance = euclidDistance(numDims, vector, clusters[0]);
for (i = 1; i < numClusters; ++i)
{
distance = euclidDistance(numDims, vector, clusters[i]);
//square root is not computed - explained in doc
if (distance < minDistance)
{
minDistance = distance;
index = i;
}
}
return index;
}
int k_means(float **vectors, //in: [numVectors][numDims]
int numDims, //num of coordinates in vector
int numVectors,
int numClusters,
float limit, //max num of iterations
int *vectorToClusterRelevance, //out: [numVectors]
float **clusters, //out: [numClusters][numDims]
MPI_Comm comm)
{
int i, j, rank, index, loop = 0, total_numVectors;
int *newClusterSize; //[numClusters]: no. vectors assigned in each new cluster
int *clusterSize; //[numClusters]: temp buffer for reduction
float delta; //num of vectors that change their cluster
float delta_tmp;
float **newClusters; //[numClusters][numDims]
//initialize vectorToClusterRelevance[]
for (i=0; i < numVectors; ++i)
{
vectorToClusterRelevance[i] = -1;
}
//initializing newClusterSize and newClusters[0] to 0
newClusterSize = (int*) calloc(numClusters, sizeof(int));
assert(newClusterSize != NULL);
clusterSize = (int*) calloc(numClusters, sizeof(int));
assert(clusterSize != NULL);
newClusters = (float**) malloc(numClusters * sizeof(float*));
assert(newClusters != NULL);
newClusters[0] = (float*) calloc(numClusters * numDims, sizeof(float));
assert(newClusters[0] != NULL);
for (i=1; i<numClusters; ++i)
{
newClusters[i] = newClusters[i-1] + numDims;
}
//not sure it's necessary
//MPI_Allreduce(&numVectors, &total_numVectors, 1, MPI_INT, MPI_SUM, comm);
do
{
double timeStart = MPI_Wtime();
delta = 0.0;
for (i = 0; i < numVectors; ++i)
{
//find the cluster nearest to the vector
index = getNearestClusterIndex(numClusters, numDims, vectors[i], clusters);
/*if the index found doesn't match the one in
vectorToClusterRelevance[i] - vector i needs to change cluster
and delta increase delta by 1 */
if (vectorToClusterRelevance[i] != index) delta += 1.0;
//update new cluster for vector i
vectorToClusterRelevance[i] = index;
//update new cluster center by summing the added vector i
newClusterSize[index]++;
for (j = 0; j < numDims; ++j)
{
newClusters[index][j] += vectors[i][j];
}
}
/* sum all data vectors in newClusters */
MPI_Allreduce(newClusters[0], clusters[0], numClusters*numDims,
MPI_FLOAT, MPI_SUM, comm);
MPI_Allreduce(newClusterSize, clusterSize, numClusters, MPI_INT,
MPI_SUM, comm);
/* average the sum and replace old cluster centers with newClusters */
for (i=0; i<numClusters; i++) {
for (j=0; j<numDims; j++)
{
if (clusterSize[i] > 1)
clusters[i][j] /= clusterSize[i];
newClusters[i][j] = 0.0; /* set back to 0 */
}
newClusterSize[i] = 0; /* set back to 0 */
}
MPI_Allreduce(&delta, &delta_tmp, 1, MPI_FLOAT, MPI_SUM, comm);
delta = delta_tmp / total_numVectors;
double maxTime;
timeStart = MPI_Wtime() - timeStart;
MPI_Reduce(&timeStart, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, comm);
if (rank == 0)
{
printf("%2d: loop =%d time =%f sec\n",rank,loop,timeStart);
}
}while (delta > 0.0 || loop++ < limit);
if (rank == 0)
{
printf("%2d: delta =%f limit =%f loop =%d\n",rank,delta,limit,loop);
}
//free all memory allocated
free(newClusters[0]);
free(newClusters);
free(newClusterSize);
free(clusterSize);
return 0;
}