####density estimation ###Simulation data phi = function(x){ 0.8*dnorm(x,sd=0.5) + 0.2*dnorm(x,mean=2,sd=0.3) } x = seq(-5,5,by=0.01) y = sapply(x,FUN=phi) plot(x,y,type="l") x = c() for(i in 1:1000){ if(runif(1)<=0.8) x[i] = rnorm(1,sd=0.5) else x[i] = rnorm(1,mean=2,sd=0.3) } hist(x) dvals = rep(0,length(x)) h = 0.2 x = sort(x) for(i in 1:length(x)){ dvals[i] = dvals[i] + sum(xx[i]-h)/(2*h*length(x)) } plot(x,dvals,col="blue",type="l",lwd=2) dens = density(x); plot(dens,col="blue",lwd=3); dvals = rep(0,length(dens$x)) for(i in 1:length(x)){ dvals = dvals + dnorm(dens$x,mean=x[i],sd=dens$bw)/length(x) } plot(dens,col="red",lwd=3); points(dens$x,dvals,col="blue",pch=19,cex=0.5) ####real data install.packages("bootstrap") library(bootstrap) data(stamp) str(stamp) thick = stamp$Thickness dens = density(thick); plot(dens,col="blue",lwd=3); dvals = rep(0,length(dens$x)) for(i in 1:length(thick)){ dvals = dvals + dnorm(dens$x,mean=thick[i],sd=dens$bw)/length(thick) } plot(dens,col="red",lwd=3); points(dens$x,dvals,col="blue",pch=19,cex=0.5) ####Hierarchical clustering - example install.packages("fields") set.seed(1234); par(mar=c(0,0,0,0)) x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2) y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2) plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) dataFrame <- data.frame(x=x,y=y) dist(dataFrame) suppressMessages(library(fields)) dataFrame <- data.frame(x=x,y=y) rdistxy <- rdist(dataFrame) diag(rdistxy) <- diag(rdistxy) + 1e5 ### 1st step # Find the index of the points with minimum distance ind <- which(rdistxy == min(rdistxy),arr.ind=TRUE) par(mfrow=c(1,2),mar=rep(0.2,4)) # Plot the points with the minimum overlayed plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) points(x[ind[1,]],y[ind[1,]],col="orange",pch=19,cex=2) # Make a cluster and cut it at the right height distxy <- dist(dataFrame) hcluster <- hclust(distxy) dendro <- as.dendrogram(hcluster) cutDendro <- cut(dendro,h=(hcluster$height[1]+0.00001) ) plot(cutDendro$lower[[11]],yaxt="n") ###2nd step library(fields) dataFrame <- data.frame(x=x,y=y) rdistxy <- rdist(dataFrame) diag(rdistxy) <- diag(rdistxy) + 1e5 # Find the index of the points with minimum distance ind <- which(rdistxy == min(rdistxy),arr.ind=TRUE) par(mar=rep(0.2,4)) # Plot the points with the minimum overlayed plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) points(x[ind[1,]],y[ind[1,]],col="orange",pch=19,cex=2) points(mean(x[ind[1,]]),mean(y[ind[1,]]),col="black",cex=3,lwd=3,pch=3) points(mean(x[ind[1,]]),mean(y[ind[1,]]),col="orange",cex=5,lwd=3,pch=1) ###3rd step library(fields) dataFrame <- data.frame(x=x,y=y) rdistxy <- rdist(dataFrame) diag(rdistxy) <- diag(rdistxy) + 1e5 # Find the index of the points with minimum distance ind <- which(rdistxy == rdistxy[order(rdistxy)][3],arr.ind=TRUE) par(mfrow=c(1,3),mar=rep(0.2,4)) # Plot the points with the minimum overlayed plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) points(x[c(5,6)],y[c(5,6)],col="orange",pch=19,cex=2) points(x[ind[1,]],y[ind[1,]],col="red",pch=19,cex=2) # Make dendogram plots distxy <- dist(dataFrame) hcluster <- hclust(distxy) dendro <- as.dendrogram(hcluster) cutDendro <- cut(dendro,h=(hcluster$height[2]) ) plot(cutDendro$lower[[10]],yaxt="n") plot(cutDendro$lower[[5]],yaxt="n") ###Hierarchical clustering - hclust dataFrame <- data.frame(x=x,y=y) distxy <- dist(dataFrame) hClustering <- hclust(distxy) plot(hClustering) ###heatmap dataFrame <- data.frame(x=x,y=y) set.seed(143) dataMatrix <- as.matrix(dataFrame)[sample(1:12),] heatmap(dataMatrix) #####K-means step by step set.seed(1234); par(mar=c(0,0,0,0)) x <- rnorm(12,mean=rep(1:3,each=4),sd=0.2) y <- rnorm(12,mean=rep(c(1,2,1),each=4),sd=0.2) plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) #### K-means clustering - starting centroids par(mar=rep(0.2,4)) plot(x,y,col="blue",pch=19,cex=2) text(x+0.05,y+0.05,labels=as.character(1:12)) cx <- c(1,1.8,2.5) cy <- c(2,1,1.5) points(cx,cy,col=c("red","orange","purple"),pch=3,cex=2,lwd=2) #### K-means clustering - assign to closest centroid par(mar=rep(0.2,4)) plot(x,y,col="blue",pch=19,cex=2) cols1 <- c("red","orange","purple") text(x+0.05,y+0.05,labels=as.character(1:12)) cx <- c(1,1.8,2.5) cy <- c(2,1,1.5) points(cx,cy,col=cols1,pch=3,cex=2,lwd=2) ## Find the closest centroid distTmp <- matrix(NA,nrow=3,ncol=12) distTmp[1,] <- (x-cx[1])^2 + (y-cy[1])^2 distTmp[2,] <- (x-cx[2])^2 + (y-cy[2])^2 distTmp[3,] <- (x-cx[3])^2 + (y-cy[3])^2 newClust <- apply(distTmp,2,which.min) points(x,y,pch=19,cex=2,col=cols1[newClust]) ###### K-means clustering - recalculate centroids par(mar=rep(0.2,4)) plot(x,y,col="blue",pch=19,cex=2) cols1 <- c("red","orange","purple") text(x+0.05,y+0.05,labels=as.character(1:12)) ## Find the closest centroid distTmp <- matrix(NA,nrow=3,ncol=12) distTmp[1,] <- (x-cx[1])^2 + (y-cy[1])^2 distTmp[2,] <- (x-cx[2])^2 + (y-cy[2])^2 distTmp[3,] <- (x-cx[3])^2 + (y-cy[3])^2 newClust <- apply(distTmp,2,which.min) points(x,y,pch=19,cex=2,col=cols1[newClust]) newCx <- tapply(x,newClust,mean) newCy <- tapply(y,newClust,mean) ## Old centroids cx <- c(1,1.8,2.5) cy <- c(2,1,1.5) points(newCx,newCy,col=cols1,pch=3,cex=2,lwd=2) #####K-means clustering - reassign values par(mar=rep(0.2,4)) plot(x,y,col="blue",pch=19,cex=2) cols1 <- c("red","orange","purple") text(x+0.05,y+0.05,labels=as.character(1:12)) cx <- c(1,1.8,2.5) cy <- c(2,1,1.5) ## Find the closest centroid distTmp <- matrix(NA,nrow=3,ncol=12) distTmp[1,] <- (x-cx[1])^2 + (y-cy[1])^2 distTmp[2,] <- (x-cx[2])^2 + (y-cy[2])^2 distTmp[3,] <- (x-cx[3])^2 + (y-cy[3])^2 newClust <- apply(distTmp,2,which.min) newCx <- tapply(x,newClust,mean) newCy <- tapply(y,newClust,mean) ## Old centroids points(newCx,newCy,col=cols1,pch=3,cex=2,lwd=2) ## Iteration 2 distTmp <- matrix(NA,nrow=3,ncol=12) distTmp[1,] <- (x-newCx[1])^2 + (y-newCy[1])^2 distTmp[2,] <- (x-newCx[2])^2 + (y-newCy[2])^2 distTmp[3,] <- (x-newCx[3])^2 + (y-newCy[3])^2 newClust2 <- apply(distTmp,2,which.min) points(x,y,pch=19,cex=2,col=cols1[newClust2]) ##### K-means clustering - update centroids par(mar=rep(0.2,4)) plot(x,y,col="blue",pch=19,cex=2) cols1 <- c("red","orange","purple") text(x+0.05,y+0.05,labels=as.character(1:12)) cx <- c(1,1.8,2.5) cy <- c(2,1,1.5) ## Find the closest centroid distTmp <- matrix(NA,nrow=3,ncol=12) distTmp[1,] <- (x-cx[1])^2 + (y-cy[1])^2 distTmp[2,] <- (x-cx[2])^2 + (y-cy[2])^2 distTmp[3,] <- (x-cx[3])^2 + (y-cy[3])^2 newClust <- apply(distTmp,2,which.min) newCx <- tapply(x,newClust,mean) newCy <- tapply(y,newClust,mean) ## Iteration 2 distTmp <- matrix(NA,nrow=3,ncol=12) distTmp[1,] <- (x-newCx[1])^2 + (y-newCy[1])^2 distTmp[2,] <- (x-newCx[2])^2 + (y-newCy[2])^2 distTmp[3,] <- (x-newCx[3])^2 + (y-newCy[3])^2 finalClust <- apply(distTmp,2,which.min) ## Final centroids finalCx <- tapply(x,finalClust,mean) finalCy <- tapply(y,finalClust,mean) points(finalCx,finalCy,col=cols1,pch=3,cex=2,lwd=2) points(x,y,pch=19,cex=2,col=cols1[finalClust]) ####kmeans() dataFrame <- data.frame(x,y) kmeansObj <- kmeans(dataFrame,centers=3) names(kmeansObj) kmeansObj$cluster par(mar=rep(0.2,4)) plot(x,y,col=kmeansObj$cluster,pch=19,cex=2) points(kmeansObj$centers,col=1:3,pch=3,cex=3,lwd=3) ###mclust install.packages("mclust") library(mclust); data(faithful); faithfulMclust <- Mclust(faithful) summary(faithfulMclust,parameters=TRUE) ###### Pathological example clust1 = data.frame(x=rnorm(100),y=rnorm(100)) a = runif(100,0,2*pi) clust2 = data.frame(x=8*cos(a) + rnorm(100),y=8*sin(a) + rnorm(100)) plot(clust2,col='blue',pch=19); points(clust1,col='green',pch=19) ###cluster by k-means dat = rbind(clust1,clust2) kk = kmeans(dat,centers=2) plot(dat,col=(kk$clust+2),pch=19)