# Work from the lecture folder so the relative file name below resolves.
setwd("C:\\Users\\authorized user\\Dropbox\\STAT445\\week9\\lectures")

# Read the cereal data set from the working directory.
cdata <- read.csv(file = "Assignment 7 Cereal Data.csv")

# Pairwise distances between rows, computed on columns 4 to 12 with dist()'s
# default metric.
d <- dist(x = cdata[, 4:12])

#rownames(d)=cdata[,2]

# Hierarchical clustering of the distance matrix d with centroid linkage.
# NOTE(review): the hclust documentation suggests centroid linkage is meant
# for squared Euclidean distances; d is used unsquared here -- confirm intent.
hc <- hclust(d, method = "centroid")

# Dendrogram with leaves labelled by column 2 of cdata; hang = -1 makes all
# labels hang down from height 0.
plot(hc, labels = cdata[, 2], hang = -1)


# Hierarchical clustering of the same distance matrix d with Ward's method.
# The method name "ward" was renamed to "ward.D" in R 3.1.0; the old spelling
# still runs but emits a message on every call. "ward.D" is the same algorithm,
# so the dendrogram is unchanged.
# NOTE(review): per the hclust documentation, "ward.D2" is the variant that
# implements Ward's criterion when given unsquared distances such as d.
# Switching would change the tree, so confirm before doing so.
hc1 <- hclust(d, method = "ward.D")

# Dendrogram with leaves labelled by column 2 of cdata, all hanging from 0.
# NOTE(review): the pch vector has 16 + 1 + (n - 1) = n + 16 entries for n
# labels, so it does not line up one-to-one with the leaves. It was presumably
# meant to mark the 17th cereal -- confirm whether it has any visible effect.
plot(
  hc1,
  hang = -1,
  labels = cdata[, 2],
  pch = c(rep(NA, 16), 15, rep(NA, length(cdata[, 2]) - 1))
)

# Ward's method tends to combine individual cereals
# together before combining larger clusters together,
# and ends up with more evenly sized sets of clusters at higher levels.
# Also, there are inversions present in the dendrogram produced by the
# centroid method.
# Ward's method tends to end up with roughly equally sized clusters.


# Run k-means and list which observations fall in each cluster.
#
# Arguments:
#   n_clusters   number of clusters, passed to kmeans() as `centers`.
#   data         data frame to cluster; defaults to the global `cdata`.
#   feature_cols columns of `data` used as clustering variables (default 4:12).
#   label_col    column of `data` holding the label reported for each
#                observation (default 2).
#   nstart       number of random starts passed to kmeans(). Defaults to 10,
#                matching the nstart = 10 used for the scree plot later in
#                this script, so the memberships shown are less likely to come
#                from a poor local optimum.
#
# Returns a list of length n_clusters named "cluster1", "cluster2", ...; each
# element holds the labels of the observations assigned to that cluster.
# Cluster numbering comes from random starts, so it can differ between runs
# unless the caller sets a seed first.
km_n <- function(n_clusters = 3, data = cdata, feature_cols = 4:12,
                 label_col = 2, nstart = 10) {
  assignment <- kmeans(
    data[, feature_cols, drop = FALSE],
    centers = n_clusters,
    nstart = nstart
  )$cluster

  # Fixing the factor levels keeps exactly n_clusters entries, ordered by
  # cluster number, regardless of which numbers appear in `assignment`.
  groups <- split(
    data[[label_col]],
    factor(assignment, levels = seq_len(n_clusters))
  )
  names(groups) <- paste0("cluster", seq_len(n_clusters))
  groups
}

# Show the k-means cluster memberships for K = 3, 4 and 5 in turn.
# K = 3
km_n(n_clusters = 3)
# K = 4
km_n(n_clusters = 4)
# K = 5
km_n(n_clusters = 5)



# The clusters from Ward's method and from k-means are more similar to each
# other, as expected.



# Scree plot of the total within-cluster sum of squares for K = 1, ..., 10.
# Each K is fitted on columns 4 to 12 of cdata with 10 random starts, and the
# fit's tot.withinss is recorded.
# vapply() is used instead of sapply() because it guarantees one numeric value
# per K, which also makes the former unlist() unnecessary. The apply form is
# chosen for compactness; the running time is dominated by the kmeans() calls,
# not by the iteration construct.
sp <- vapply(
  1:10,
  function(k) kmeans(cdata[, 4:12], centers = k, nstart = 10)$tot.withinss,
  numeric(1)
)

# Plot sp against the number of clusters.
plot(
  1:10, sp,
  type = "b",
  main = "Scree plot",
  ylab = "total within cluster sum of squares",
  xlab = "number of clusters"
)


# Scree values computed with an explicit for loop: one kmeans() fit per
# K = 1, ..., 10 on columns 4 to 12 of cdata, 10 random starts each.
sp1 <- rep(NA_real_, 10)
for (i in seq_along(sp1)) {
  fit <- kmeans(cdata[, 4:12], centers = i, nstart = 10)
  sp1[i] <- fit$tot.withinss
}

# Plot sp1 against the number of clusters.
plot(
  1:10, sp1,
  type = "b",
  main = "Scree plot",
  ylab = "total within cluster sum of squares",
  xlab = "number of clusters"
)




# g.  Does this scree plot give any clear indication of the appropriate number of clusters to use? Explain in at most one sentence. 

# The elbows at K = 3 and 6 suggest setting K to somewhere in this range. Also, anyone who is familiar with many of these breakfast cereals will recognize that the groupings with K=3 do indeed highlight major types with, e.g., ones that concentrate on flavour without much regard to nutrition tending to fall into the same group. I'd be inclined to opt for a smaller number of groupings if I felt that this was sufficient to highlight such relevant aspects of the data. 
# But I would likely be inclined to show a dendrogram if I felt that my audience could understand what it portrayed reasonably easily, and then I could describe the major groupings along with some lesser ones if they were relevant as well. Figures are usually more useful than tables and lists. 
# The dendrogram for Ward's method shows very clearly how a few of the "junk food" cereals stand out by themselves and that there are two other reasonably distinct, large groups. It also shows how each of these splits into two subgroups at about the same "distance". It's also easy to search through the cereal names at the bottom to see visually where cereals you recognize appear in the clustering. 
# 
# h.	The function, "kmeans" will not let you construct a set of K = 43 clusters, but you can easily figure out what it would have to be. What would the value of this within-cluster sum of squares be for K = 43?

# It would be 0 since each item would form its own cluster with the centroid identical to the vector of observed values. 

# i.	You were not asked to standardize the variables before proceeding. Might there be some advantage to doing so? Explain in at most two sentences. (For the purpose of this assignment, there is no need to do any further computations.)

# Yes, there might well be. The scales of these measurements are not comparable: quantities such as sodium, which have a much larger range than, e.g., fibre content, may have exerted undue influence on the unstandardized analysis performed here.
#