计算K均值聚类的质心

时间:2017-05-10 11:12:13

标签: r

我的数据框为fllow:

.config

我打算在我的数据框上应用k-means算法:

PickUP <- data.frame(pickuplong = c(-73.93909 ,-73.94189 ,-73.93754,-73.91638,-73.92792 ,-73.88634), pickuplat =c(40.84408,40.83841,40.85311,40.84966,40.86284,40.85628)) 

现在我尝试计算每个群集的质心,如下所示:

library(maps)
library(pamr)
library(ggplot2)
library(rgdal)
library(png)
library(grid)

p <- as.data.frame(project(cbind(mdt$Pickup_longitude, mdt$Pickup_latitude), 
    proj = "+init=ESRI:54030")) 
names(p) <- c("long.rob", "lat.rob")
mdt2 <- cbind(mdt, p)
theme_map <- list(theme(panel.grid.minor = element_blank(), panel.grid.major 
             = element_blank(),panel.background = element_blank(), 
             plot.background = element_rect(fill = "#e6e8ed"), panel.border 
             = element_blank(), axis.line = element_blank(), axis.text.x = 
             element_blank(),axis.text.y = element_blank(), axis.ticks = 
             element_blank(), axis.title.x = element_blank(), axis.title.y = 
             element_blank()))  
# Spherical coordinates in radians
mdt2$long.rad <- mdt2$Pickup_longitude * (2 * pi)/360
mdt2$lat.rad <- (mdt2$Pickup_latitude * 2) * (2 * pi)/360
R <- (6378 + 6356)/2
# Cartesian coordinates
mdt2$x = R * cos(mdt2$lat.rad) * cos(mdt2$long.rad)
mdt2$y = R * cos(mdt2$lat.rad) * sin(mdt2$long.rad)
mdt2$z = R * sin(mdt2$lat.rad)
# Perform kmeans with k = 1:10
matrix <- mdt2[, colnames(mdt2) %in% c("x", "y", "z")]
models <- list()
chs <- NULL
for (c in seq(2, 10, 1)) {
  model <- kmeans(x = matrix, center = c, nstart = 50)
  models <- append(models, list(model))
  support = c
  ch = (sum(model$betweenss)/(c - 1))/(sum(model$withinss)/(sum(model$size) 
        - c))
  chs = rbind(chs, data.frame(support, ch))  }  
  # Plot the CH index
  ggplot(data = chs) + geom_line(aes(x = support, y = ch)) + 
  ggtitle("Identifying the optimal number of clusters") + 
  xlab("Number of clusters") + ylab("CH Index")
  # Select the model with the highest CH index
  best = models[[which.max(chs$ch)]]
  best_k = length(best$size)
  mdt2$cluster = best$cluster
  # Plot the clusters from the best model
  county_df <- map_data("county")
  ny <- subset(county_df, region=="new york")
  ggplot(ny) + geom_point(data=mdt2, aes(x=long.rob, y=lat.rob, color= 
  as.factor(cluster)))+
  guides(fill= FALSE) + theme(axis.text.x=element_text(angle=90, hjust=1))+
  scale_colour_discrete(name = "clusters") + xlab("Pickup longitude")+ 
  ylab("Pickup latitude")

但每次我得到不同的结果。我的错在哪里?

0 个答案:

没有答案