00001 #ifndef fl_cluster_h
00002 #define fl_cluster_h
00003
00004
00005 #include "fl/matrix.h"
00006 #include "fl/socket.h"
00007 #include "fl/descriptor.h"
00008
00009 #include <iostream>
00010 #include <vector>
00011
00012 #include <sys/types.h>
00013 #include <sys/socket.h>
00014 #include <netinet/in.h>
00015
00016
00017 namespace fl
00018 {
00019
00020
00021 class ClusterMethod
00022 {
00023 public:
00024 virtual void run (const std::vector< Vector<float> > & data) = 0;
00025 virtual int classify (const Vector<float> & point) = 0;
00026 virtual Vector<float> distribution (const Vector<float> & point) = 0;
00027 virtual int classCount () = 0;
00028 virtual Vector<float> representative (int group) = 0;
00029
00030
00031
00032
00033 virtual void read (std::istream & stream);
00034 virtual void write (std::ostream & stream, bool withName = false);
00035
00036 bool stop;
00037 };
00038
00039
00040
00041
00042 class ClusterGauss
00043 {
00044 public:
00045 ClusterGauss (Vector<float> & center, float alpha = 1.0);
00046 ClusterGauss (Vector<float> & center, Matrix<float> & covariance, float alpha = 1.0);
00047 ClusterGauss (std::istream & stream);
00048 ~ClusterGauss ();
00049
00050 void prepareInverse ();
00051 float probability (const Vector<float> & point, float * scale = NULL, float * minScale = NULL);
00052 void read (std::istream & stream);
00053 void write (std::ostream & stream);
00054
00055 float alpha;
00056 Vector<float> center;
00057 Matrix<float> covariance;
00058 Matrix<float> eigenvectors;
00059 Vector<float> eigenvalues;
00060 Matrix<float> eigenverse;
00061 float det;
00062 };
00063
00064 class KMeans : public ClusterMethod
00065 {
00066 public:
00067 KMeans (float maxSize, float minSize, int initialK, int maxK, const std::string & clusterFileName = "");
00068 KMeans (std::istream & stream, const std::string & clusterFileName = "");
00069
00070 virtual void run (const std::vector< Vector<float> > & data);
00071 virtual int classify (const Vector<float> & point);
00072 virtual Vector<float> distribution (const Vector<float> & point);
00073 virtual int classCount ();
00074 virtual Vector<float> representative (int group);
00075 virtual void read (std::istream & stream);
00076 virtual void write (std::ostream & stream, bool withName = false);
00077
00078 void initialize (const std::vector< Vector<float> > & data);
00079 void estimate (const std::vector< Vector<float> > & data, Matrix<float> & member, int jbegin, int jend);
00080 float maximize (const std::vector< Vector<float> > & data, const Matrix<float> & member, int i);
00081 bool convergence (const std::vector< Vector<float> > & data, const Matrix<float> & member, float largestChange);
00082
00083
00084 float maxSize;
00085 float minSize;
00086 int initialK;
00087 int maxK;
00088 std::vector<ClusterGauss> clusters;
00089 std::vector<float> changes;
00090 std::vector<float> velocities;
00091
00092
00093 std::string clusterFileName;
00094 time_t clusterFileTime;
00095 off_t clusterFileSize;
00096 };
00097
00098 class KMeansParallel : public KMeans
00099 {
00100 public:
00101 KMeansParallel (float maxSize, float minSize, int initialK, int maxK, const std::string & clusterFileName = "");
00102 KMeansParallel (std::istream & stream, const std::string & clusterFileName = "");
00103
00104 virtual void run (const std::vector< Vector<float> > & data);
00105
00106 static void * listenThread (void * arg);
00107 static void * proxyThread (void * arg);
00108 void client (std::string serverName);
00109
00110
00111 int iteration;
00112 const std::vector<Vector<float> > * data;
00113 Matrix<float> member;
00114 float largestChange;
00115 enum EMstate
00116 {
00117 initializing,
00118 estimating,
00119 maximizing,
00120 checking
00121 };
00122 EMstate state;
00123 pthread_mutex_t stateLock;
00124 std::vector<int> workUnits;
00125 int unitsPending;
00126
00127 struct ThreadDataHolder
00128 {
00129 KMeansParallel * kmeans;
00130 int connection;
00131 struct sockaddr_in peer;
00132 };
00133 };
00134
00135
// Tuning constants.
// NOTE(review): workUnitSize and portNumber are presumably consumed by
// KMeansParallel -- confirm.  They deliberately remain macros: a macro is
// visible outside namespace fl, so replacing them with namespace-scope
// constants could break code that uses the unqualified names.
#define workUnitSize 1000
#define portNumber 60000

// Smallest positive *normalized* float (FLT_MIN, 2^-126).  The previous value
// of 1e-38 lies below FLT_MIN and is therefore a subnormal number, which
// contradicts the name of the constant.
const float smallestNormalFloat = 1.17549435e-38f;
const float largestNormalFloat = 1e38;
// exp (-87) is about 1.6e-38, i.e. just above the smallest normal float.
// NOTE(review): presumably the largest exponent magnitude that is still safe
// to pass to exp() -- confirm against the implementation.
const float largestDistanceFloat = 87;
00141
00142
00143
00144
00145 class ClusterCosine
00146 {
00147 public:
00148 ClusterCosine (int dimension);
00149 ClusterCosine (Vector<float> & center);
00150 ClusterCosine (std::istream & stream);
00151
00152 float distance (const Vector<float> & point);
00153 float update (const Vector<float> & point, float weight);
00154
00155 void read (std::istream & stream);
00156 void write (std::ostream & stream);
00157
00158 Vector<float> center;
00159 };
00160
00161 class Kohonen : public ClusterMethod
00162 {
00163 public:
00164 Kohonen (int width, float sigma = 1.0, float learningRate = 0.1, float decayRate = 0.5);
00165 Kohonen (std::istream & stream);
00166
00167 virtual void run (const std::vector< Vector<float> > & data);
00168 virtual int classify (const Vector<float> & point);
00169 virtual Vector<float> distribution (const Vector<float> & point);
00170 virtual int classCount ();
00171 virtual Vector<float> representative (int group);
00172 virtual void read (std::istream & stream);
00173 virtual void write (std::ostream & stream, bool withName = false);
00174
00175 std::vector<ClusterCosine> map;
00176 int width;
00177 float sigma;
00178 float learningRate;
00179 float decayRate;
00180 };
00181
00182
00183
00184
00185 class ClusterAgglomerative
00186 {
00187 public:
00188 ClusterAgglomerative (const Vector<float> & center, int count = 1);
00189 ClusterAgglomerative (std::istream & stream);
00190
00191 void operator += (const ClusterAgglomerative & that);
00192 void read (std::istream & stream);
00193 void write (std::ostream & stream);
00194
00195 Vector<float> center;
00196 int count;
00197 };
00198
00199 class Agglomerate : public ClusterMethod
00200 {
00201 public:
00202 Agglomerate (Comparison * comparison, float distanceLimit, int minClusters = 1);
00203 Agglomerate (std::istream & stream);
00204 ~Agglomerate ();
00205
00206 virtual void run (const std::vector< Vector<float> > & data);
00207 virtual int classify (const Vector<float> & point);
00208 virtual Vector<float> distribution (const Vector<float> & point);
00209 virtual int classCount ();
00210 virtual Vector<float> representative (int group);
00211 virtual void read (std::istream & stream);
00212 virtual void write (std::ostream & stream, bool withName = false);
00213
00214 Comparison * comparison;
00215 float distanceLimit;
00216 int minClusters;
00217 std::vector<ClusterAgglomerative *> clusters;
00218 };
00219 }
00220
00221
00222 #endif