K-means算法C ++,交换点数?

时间:2015-12-02 06:34:48

标签: c++ k-means centroid

对于我的任务,我必须对600个数据点应用K-means算法,这些数据点都有60个维度(或属性,如果你愿意的话)。我将600个数据点存储到6个集群中(所以K = 6),这就是我到目前为止所拥有的:

#include <iostream> //std::cout
#include <fstream> //std::ifstream
#include <string> //std::string
#include <sstream> //std::istringstream
#include <vector> //std::vector
#include <cmath> //std::cmath
#include <array> //std::array

#define K 6
#define MAX_ITERATIONS 10000000
#define NUM_ATTRIBUTES 60

struct Point{
        std::array<double, NUM_ATTRIBUTES> point;
        std::string classType;
};

struct Cluster{
        std::vector<Point> points;
        Point centroid;
};

int randNumGenerator(int max){
        int num = (rand() % max);
        return num;
}

void setData(Point p, std::string line, std::vector<Point> &data, int index){
        std::stringstream s(line);
        std::string classes[] = {"Normal", "Cyclic", "Increasing trend", 
                                "Decreasing trend", "Upward shift", "Downward shift"};

        for(int i = 0; i < NUM_ATTRIBUTES; i++){
                double num;
                if(s >> num){
                        p.point[i] = num;
                }
        }

        if(index > 0 && index <= 100){
                p.classType = classes[0];
        }
        else if(index > 100 && index <= 200){
                p.classType = classes[1];
        }
        else if(index > 200 && index <= 300){
                p.classType = classes[2];
        }
        else if(index > 300 && index <= 400){
                p.classType = classes[3];
        }
        else if(index > 400 && index <= 500){
                p.classType = classes[4];
        }
        else if(index > 500 && index <= 600){
                p.classType = classes[5];
        }
        data.push_back(p);
}

void initializeCentroids(std::vector<Point> &points, int num_clusters,
                                std::vector<Point> &centroids){
        Point p;
        std::vector<bool> numsUsedAlready(points.size());
        for(int i = 0; i < num_clusters; i++){
                int randNum = randNumGenerator(points.size());
                while(numsUsedAlready[randNum]){
                        randNum = randNumGenerator(points.size());
                }
                numsUsedAlready[randNum] = true;
                p = points[randNum];
                centroids.push_back(p);
        }
}

double calculateDistance(Point p, Point centroid){
        double ret = 0;
        for(int i = 0; i < p.point.size(); i++){
                double distance = p.point[i] - centroid.point[i];
                ret += distance * distance;
        }
        return sqrt(ret);
}

void setCentroids(std::vector<Cluster> &clusters, std::vector<Point> centroids){
        for(int i = 0; i < centroids.size(); i++){
                Cluster c;
                c.points.push_back(centroids[i]);
                c.centroid = centroids[i];
                clusters.push_back(c);
        }
}

void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){
        int sendToCluster = 99999999;
        for(int index = 0; index < points.size(); index++){
                double minDist = 99999999999;
                Point p = points[index];
                for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){
                        double tempDist = calculateDistance(p, clusters[clusterNum].centroid);
                        //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl;
                        if(tempDist < minDist){
                                minDist = tempDist;
                                sendToCluster = clusterNum;
                        }
                }
                //std::cout << "Pushing  to clusterNUm " << sendToCluster << std::endl;
                clusters[sendToCluster].points.push_back(p);
        }
}

void updateCentroid(std::vector<Cluster> &clusters){
        for(int i = 0; i < clusters.size(); i++){
                Cluster c = clusters[i];
                for(int j = 0; j < NUM_ATTRIBUTES; j++){
                        double avg = 0;
                        for(int h = 0; h < c.points.size(); h++){
                                Point p = c.points[h];
                                avg += p.point[j];
                        }
                        double oldCentroidValue = c.centroid.point[j];
                        c.centroid.point[j] = avg / c.points.size();
                        std::cout << "old: " << oldCentroidValue << " new: " << c.centroid.point[j] << std::endl;
                }
        }
}

void k_clustering(std::vector<Point> &points, int num_clusters){
        std::vector<Point> centroids;
        initializeCentroids(points, num_clusters, centroids);
        std::vector<Cluster> clusters;
        setCentroids(clusters, centroids);

        /*for(int i = 0; i < clusters.size(); i++){
                for(int j = 0; j < NUM_ATTRIBUTES; j++){
                        std::cout << clusters[i].centroid.point[j] << " ";
                }
                std::cout << std::endl;
        }
        */

        setClusters(clusters, points);
        updateCentroid(clusters);
        for(int i = 0; i < clusters.size(); i++){
                std::cout << i << " " << clusters[i].points.size() << std::endl;
        }
}

void readInputFile(std::ifstream &file, std::vector<Point> &data){
        std::string line;
        int counter = 0;
        while(getline(file,line)){
                counter++;
                Point p;
                setData(p, line, data, counter);
        }
}

void usageString(){
        std::cout << "Usage: output <input_file>" << std::endl;
}

char* checkFile(int argc, char *argv[]){
        if(argc < 2){
                usageString();
                exit(0);
        }
        else{
                char *inputfile = argv[1];
                return inputfile;
        }
}

int main(int argc, char *argv[]){
        char *inputfile;
        inputfile = checkFile(argc, argv);

        std::ifstream input(inputfile);
        if(!input.is_open()){
                std::cerr << "Error: Data file doesn't exist" << std::endl;
                return EXIT_FAILURE;
        }
        srand(time(NULL));
        std::vector<Point> data;
        readInputFile(input, data);
        k_clustering(data, K);
        //printData(data);
        //Attribute closest cluster to each data point
          //For each point, calculate distance to each centroid
            //Assign point to cluster (and update centroid value, by finding mean values of all points)
          //Repeat until nothing changed or after a certain number of iterations

        return 1;
}

然而,我注意到我的代码只能在一次迭代中正常工作。但是在第一次迭代之后,在我的setClusters&#39;功能,我推进了一个点。我必须将Point转移到另一个群集(如果需要),但是如果不采取从该群集中删除该Point然后将其推送到另一个群集的漫长路径,我会对如何做到这一点感到困惑。我确信有一种更有效的方法可以将Point换成另一个群集。

1 个答案:

答案 0 :(得分:0)

我没有时间浏览整个代码,但如果您对优化setClusters()感兴趣,那么也许您可以尝试C ++的移动语义和{ {1}} vector

emplace_back()

另一个想法是void setClusters(std::vector<Cluster> &clusters, std::vector<Point> points){ int sendToCluster = 99999999; for(int index = 0; index < points.size(); index++){ double minDist = 99999999999; Point& p = points[index]; // use a reference to avoid copying, unless you don't want to alter the original "points" vector for(int clusterNum = 0; clusterNum < clusters.size(); clusterNum++){ double tempDist = calculateDistance(p, clusters[clusterNum].centroid); //std::cout << "dist: " << tempDist << " clusterNum: " << clusterNum << std::endl; if(tempDist < minDist){ minDist = tempDist; sendToCluster = clusterNum; } } //std::cout << "Pushing to clusterNUm " << sendToCluster << std::endl; clusters[sendToCluster].points .emplace_back( // construct the new item in place std::move(p) // by "moving" from p ); } points.clear(); // clear the input vector, unless you don't want to alter it. cf. the comment on using references } s持有&#34;指数&#34;而不是实际的&#34;点&#34;。在这种情况下,您可以将结构声明为:

Cluster

然后,在struct Cluster{ std::vector<int> pointIndices; Point centroid; }; 中,您可以推送索引:

setClusters()

我希望它有所帮助。

相关问题