Main Page   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

cluster.h

Go to the documentation of this file.
00001 #ifndef fl_cluster_h
00002 #define fl_cluster_h
00003 
00004 
00005 #include "fl/matrix.h"
00006 #include "fl/socket.h"
00007 #include "fl/descriptor.h"
00008 
00009 #include <iostream>
00010 #include <vector>
00011 
00012 #include <sys/types.h>
00013 #include <sys/socket.h>
00014 #include <netinet/in.h>
00015 
00016 
00017 namespace fl
00018 {
00019   // Generic clustering interface ---------------------------------------------
00020 
00021   class ClusterMethod  // Abstract interface implemented by the clustering algorithms in this header (KMeans, Kohonen, Agglomerate).
00022   {
00023   public:
00024         virtual void          run (const std::vector< Vector<float> > & data) = 0;  // Entry point that is handed the whole data set; each subclass supplies its own algorithm.
00025         virtual int           classify (const Vector<float> & point) = 0;  // presumably the index of the class that best matches point -- confirm in subclasses
00026         virtual Vector<float> distribution (const Vector<float> & point) = 0;  // presumably one membership value per class for point -- confirm in subclasses
00027         virtual int           classCount () = 0;  // Number of classes; presumably bounds the "group" argument below -- confirm
00028         virtual Vector<float> representative (int group) = 0;  // Vector standing in for the given group.
00029         virtual ~ClusterMethod () {}  // Virtual so that destroying a subclass through a ClusterMethod pointer is well-defined (Agglomerate, for one, declares its own destructor).
00030         // The read () and write () methods should serialize enough data to either
00031         // resume clustering with a call to run () or to answer cluster queries via
00032         // classify () and representative ().
00033         virtual void read (std::istream & stream);
00034         virtual void write (std::ostream & stream, bool withName = false);  // withName defaults to false; NOTE(review): presumably controls whether a class-name tag is written -- confirm in cluster.cc
00035 
00036         bool stop;  // NOTE(review): nothing in this header initializes this; presumably a stop-request flag polled by run () -- confirm
00037   };
00038 
00039 
00040   // Soft K-means -------------------------------------------------------------
00041 
00042   class ClusterGauss  // One cluster record of the soft K-means method; KMeans keeps a std::vector of these.
00043   {
00044   public:
00045         ClusterGauss (Vector<float> & center, float alpha = 1.0);  // Build from a center only (alpha defaults to 1.0); how covariance starts out is decided in cluster.cc.
00046         ClusterGauss (Vector<float> & center, Matrix<float> & covariance, float alpha = 1.0);  // Build from an explicit center and covariance (alpha defaults to 1.0).
00047         ClusterGauss (std::istream & stream);  // Construct from a stream; presumably delegates to read () -- confirm in cluster.cc
00048         ~ClusterGauss ();
00049 
00050         void prepareInverse ();  // presumably derives eigenvectors / eigenvalues / eigenverse / det from covariance -- confirm in cluster.cc
00051         float probability (const Vector<float> & point, float * scale = NULL, float * minScale = NULL);  // scale and minScale are optional (default NULL); their exact meaning is defined in cluster.cc -- TODO document
00052         void read (std::istream & stream);  // Stream counterpart of write ().
00053         void write (std::ostream & stream);
00054 
00055         float alpha;  // presumably the weight of this cluster in the mixture -- confirm
00056         Vector<float> center;
00057         Matrix<float> covariance;
00058         Matrix<float> eigenvectors;  // NOTE(review): eigenvectors / eigenvalues / eigenverse look like cached values derived from covariance -- confirm
00059         Vector<float> eigenvalues;
00060         Matrix<float> eigenverse;
00061         float det;  // presumably a determinant related to covariance -- confirm
00062   };
00063 
00064   class KMeans : public ClusterMethod  // Soft K-means implementation of ClusterMethod, built on a vector of ClusterGauss.
00065   {
00066   public:
00067         KMeans (float maxSize, float minSize, int initialK, int maxK, const std::string & clusterFileName = "");  // Arguments match the same-named members below; clusterFileName defaults to the empty string.
00068         KMeans (std::istream & stream, const std::string & clusterFileName = "");  // Construct from a stream; presumably delegates to read () -- confirm in cluster.cc
00069 
00070         virtual void          run (const std::vector< Vector<float> > & data);  // The next seven declarations override the ClusterMethod interface.
00071         virtual int           classify (const Vector<float> & point);
00072         virtual Vector<float> distribution (const Vector<float> & point);
00073         virtual int           classCount ();
00074         virtual Vector<float> representative (int group);
00075         virtual void          read (std::istream & stream);
00076         virtual void          write (std::ostream & stream, bool withName = false);
00077 
00078         void initialize (const std::vector< Vector<float> > & data);  // The four helpers below appear to correspond to the phases named in KMeansParallel::EMstate -- confirm
00079         void estimate (const std::vector< Vector<float> > & data, Matrix<float> & member, int jbegin, int jend);  // Writes into member; presumably covers data items jbegin up to jend -- confirm whether jend is inclusive
00080         float maximize (const std::vector< Vector<float> > & data, const Matrix<float> & member, int i);  // Reads member; i presumably selects one cluster, return value presumably its change -- confirm
00081         bool convergence (const std::vector< Vector<float> > & data, const Matrix<float> & member, float largestChange);  // presumably true once clustering may stop -- confirm
00082 
00083         // State of clustering process
00084         float maxSize;  // NOTE(review): units and meaning of maxSize / minSize are defined in cluster.cc -- TODO document
00085         float minSize;  
00086         int initialK;  // presumably the number of clusters to start with -- confirm
00087         int maxK;  // presumably an upper bound on the number of clusters -- confirm
00088         std::vector<ClusterGauss> clusters;
00089         std::vector<float> changes;
00090         std::vector<float> velocities;
00091 
00092         // Control information
00093         std::string clusterFileName;  // NOTE(review): <string> is not included directly by this header; it relies on another include providing std::string
00094         time_t clusterFileTime;  // presumably the last observed modification time of clusterFileName -- confirm
00095         off_t clusterFileSize;  // presumably the last observed size of clusterFileName -- confirm
00096   };
00097 
00098   class KMeansParallel : public KMeans  // KMeans variant that overrides run () and adds thread entry points, a client () method and shared EM state.
00099   {
00100   public:
00101         KMeansParallel (float maxSize, float minSize, int initialK, int maxK, const std::string & clusterFileName = "");  // Same parameter list as the corresponding KMeans constructor.
00102         KMeansParallel (std::istream & stream, const std::string & clusterFileName = "");  // Same parameter list as the stream constructor of KMeans.
00103 
00104         virtual void run (const std::vector< Vector<float> > & data);  // Overrides KMeans::run ().
00105 
00106         static void * listenThread (void * arg);  // Static with a void * (void *) signature, the shape a pthread start routine takes; what arg points at is defined in cluster.cc -- confirm
00107         static void * proxyThread (void * arg);  // Same signature as listenThread; arg is presumably a ThreadDataHolder * -- confirm
00108         void client (std::string serverName);  // presumably the worker side that connects to the named server -- confirm
00109 
00110         // Shared state for parallel processing
00111         int iteration;
00112         const std::vector<Vector<float> > * data;  // Pointer to read-only data; ownership is not visible from this header.
00113         Matrix<float> member;
00114         float largestChange;
00115         enum EMstate  // Phase tags; the names parallel the KMeans helpers initialize / estimate / maximize / convergence.
00116         {
00117           initializing,
00118           estimating,
00119           maximizing,
00120           checking
00121         };
00122         EMstate          state;
00123         pthread_mutex_t  stateLock;  // presumably guards the shared state in this section -- confirm which members; NOTE(review): <pthread.h> is not included directly by this header
00124         std::vector<int> workUnits;  // presumably identifiers of outstanding pieces of work (see the workUnitSize macro after this class) -- confirm
00125         int              unitsPending;  // presumably the count of work units handed out but not yet finished -- confirm
00126 
00127         struct ThreadDataHolder  // Bundle of a KMeansParallel pointer, a connection value and a peer address; presumably what proxyThread receives -- confirm
00128         {
00129           KMeansParallel * kmeans;
00130           int connection;  // presumably a socket descriptor -- confirm
00131           struct sockaddr_in peer;
00132         };
00133   };
00134 
00135   // Find a more elegant way to disseminate these constants!  NOTE(review): the two macros below ignore namespace fl and rewrite those identifiers in every file that includes this header.
00136 #define workUnitSize 1000  // presumably the number of data items per work unit in KMeansParallel -- confirm
00137 #define portNumber   60000  // presumably the port used by KMeansParallel networking -- confirm
00138   const float smallestNormalFloat = 1e-38;  // NOTE(review): IEEE-754 single precision FLT_MIN is about 1.175e-38, so 1e-38 is slightly below the smallest normal float -- confirm intent
00139   const float largestNormalFloat = 1e38;  // Below FLT_MAX (about 3.4e38 for IEEE-754 single precision).
00140   const float largestDistanceFloat = 87;  // exp (-87) is about 1.6e-38; presumably the largest exponent magnitude kept near the normal float range -- confirm
00141 
00142 
00143   // Kohonen map --------------------------------------------------------------
00144 
00145   class ClusterCosine  // One map entry of the Kohonen method; Kohonen keeps a std::vector of these.
00146   {
00147   public:
00148         ClusterCosine (int dimension);  // Build from a dimension only; presumably sizes center accordingly, initial contents are decided in cluster.cc -- confirm
00149         ClusterCosine (Vector<float> & center);  // Build from an explicit center.
00150         ClusterCosine (std::istream & stream);  // Construct from a stream; presumably delegates to read () -- confirm in cluster.cc
00151 
00152         float distance (const Vector<float> & point);  // presumably a cosine-based measure between center and point (going by the class name) -- confirm
00153         float update (const Vector<float> & point, float weight);  // presumably moves center toward point by weight; meaning of the returned float is defined in cluster.cc -- confirm
00154 
00155         void read (std::istream & stream);  // Stream counterpart of write ().
00156         void write (std::ostream & stream);
00157 
00158         Vector<float> center;
00159   };
00160 
00161   class Kohonen : public ClusterMethod  // Kohonen-map implementation of ClusterMethod, built on a vector of ClusterCosine.
00162   {
00163   public:
00164         Kohonen (int width, float sigma = 1.0, float learningRate = 0.1, float decayRate = 0.5);  // Arguments match the same-named members below; defaults are 1.0, 0.1 and 0.5.
00165         Kohonen (std::istream & stream);  // Construct from a stream; presumably delegates to read () -- confirm in cluster.cc
00166 
00167         virtual void          run (const std::vector< Vector<float> > & data);  // The next seven declarations override the ClusterMethod interface.
00168         virtual int           classify (const Vector<float> & point);
00169         virtual Vector<float> distribution (const Vector<float> & point);
00170         virtual int           classCount ();
00171         virtual Vector<float> representative (int group);
00172         virtual void          read (std::istream & stream);
00173         virtual void          write (std::ostream & stream, bool withName = false);
00174 
00175         std::vector<ClusterCosine> map;  // Stored as a flat vector; how entries map onto the grid is defined in cluster.cc.
00176         int width;  // presumably the side length of the map, i.e. width * width entries -- confirm
00177         float sigma;  // presumably the spread of the neighborhood function -- confirm
00178         float learningRate;  // presumably the step size applied on each update -- confirm
00179         float decayRate;  // presumably how quickly learningRate shrinks during run () -- confirm
00180   };
00181 
00182 
00183   // Agglomerative clustering -------------------------------------------------
00184 
00185   class ClusterAgglomerative  // One cluster record of the agglomerative method; Agglomerate keeps a vector of pointers to these.
00186   {
00187   public:
00188         ClusterAgglomerative (const Vector<float> & center, int count = 1);  // Build from a center; count defaults to 1.
00189         ClusterAgglomerative (std::istream & stream);  // Construct from a stream; presumably delegates to read () -- confirm in cluster.cc
00190 
00191         void operator += (const ClusterAgglomerative & that);  // Folds that into this object; presumably a count-weighted merge of the centers -- confirm. Returns void, so it cannot be chained.
00192         void read (std::istream & stream);  // Stream counterpart of write ().
00193         void write (std::ostream & stream);
00194 
00195         Vector<float> center;
00196         int count;  // presumably the number of data points merged into this cluster -- confirm
00197   };
00198 
00199   class Agglomerate : public ClusterMethod  // Agglomerative implementation of ClusterMethod, built on heap-referenced ClusterAgglomerative records.
00200   {
00201   public:
00202         Agglomerate (Comparison * comparison, float distanceLimit, int minClusters = 1);  // Arguments match the same-named members below; minClusters defaults to 1.
00203         Agglomerate (std::istream & stream);  // Construct from a stream; presumably delegates to read () -- confirm in cluster.cc
00204         ~Agglomerate ();  // NOTE(review): presumably releases the entries of clusters (and possibly comparison) -- confirm ownership in cluster.cc
00205 
00206         virtual void          run (const std::vector< Vector<float> > & data);  // The next seven declarations override the ClusterMethod interface.
00207         virtual int           classify (const Vector<float> & point);
00208         virtual Vector<float> distribution (const Vector<float> & point);
00209         virtual int           classCount ();
00210         virtual Vector<float> representative (int group);
00211         virtual void read (std::istream & stream);
00212         virtual void write (std::ostream & stream, bool withName = false);
00213 
00214         Comparison * comparison;  // Raw pointer; Comparison is declared outside this header and ownership is not visible here.
00215         float distanceLimit;  // presumably the distance beyond which clusters are no longer merged -- confirm
00216         int minClusters;  // presumably merging stops once this many clusters remain -- confirm
00217         std::vector<ClusterAgglomerative *> clusters;  // Raw pointers; see the ownership note on the destructor.
00218   };
00219 }
00220 
00221 
00222 #endif

Generated on Thu Dec 9 17:13:24 2004 for fl by doxygen 1.2.18