# Switch to the tutorial folder only when it exists on this machine, so that
# sourcing the script elsewhere does not abort on the very first line.
# None of the code visible in this script reads or writes files.
tutorial_dir <- "C:\\Users\\authorized user\\Dropbox\\STAT445\\week7\\tutorials"
if (dir.exists(tutorial_dir)) {
  setwd(tutorial_dir)
}

###############################################################################
#Some combinations of data set and hierarchical clustering method, such as
#the centroid and median methods, can produce inversions.
#An inversion occurs when a new cluster is merged into an existing cluster at
#a smaller distance than that of the previous merger (coalescence).
#This occurs due to the property of non-monotonicity, i.e. similarity can
#increase from one iteration to the next in the case of the centroid and
#median methods. In the single and complete linkage methods similarity is
#monotonically decreasing from iteration to iteration, so these methods do
#not suffer from the inversion issue.
###############################################################################



###############################################################################
#Example data set where changing the distance method fixes the inversion.
###############################################################################
# Example 1: a 4 x 4 matrix filled row by row; rows are labelled as genes and
# columns as experiments.
data <- matrix(
  c(0.96, 0.07, 0.97, 0.98,
    0.50, 0.28, 0.29, 0.77,
    0.08, 0.96, 0.51, 0.51,
    0.14, 0.19, 0.41, 0.51),
  ncol = 4, byrow = TRUE
)

colnames(data) <- c("Exp1", "Exp2", "Exp3", "Exp4")
rownames(data) <- c("Gene1", "Gene2", "Gene3", "Gene4")

# Distance method: maximum (largest absolute difference over the columns).
# With centroid linkage the last merge happens at a lower height than the one
# before it, i.e. the dendrogram shows an inversion.
mat <- dist(data, method = "maximum")

hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)

# Distance method: Euclidean (spelled "euclidean", as documented in ?dist).
# On this data set the merge heights are non-decreasing again, so the
# inversion disappears.
mat <- dist(data, method = "euclidean")

hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)




###############################################################################
#But there exist data sets where the inversion cannot be fixed by changing
#the distance method!
#Below is such an example.
###############################################################################
# Example 2: a 4 x 6 matrix filled row by row (24 values, six per row); rows
# are labelled as genes and columns as experiments.
data <- matrix(
  c(0.96, 0.07, 0.97, 0.98, 0.99, 0.50,
    0.28, 0.29, 0.77, 0.78, 0.08, 0.96,
    0.51, 0.51, 0.55, 0.14, 0.19, 0.41,
    0.51, 0.40, 0.97, 0.98, 0.99, 0.50),
  ncol = 6, byrow = TRUE
)

colnames(data) <- c("Exp1", "Exp2", "Exp3", "Exp4", "Exp5", "Exp6")
rownames(data) <- c("Gene1", "Gene2", "Gene3", "Gene4")

# Each section below recomputes the distance matrix with a different metric,
# reruns centroid-linkage clustering and redraws the dendrogram.
# `mat` and `hc_c` are overwritten every time.

# Distance method: maximum (largest absolute difference over the columns).
mat <- dist(data, method = "maximum")

hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)


# Distance method: Euclidean (spelled "euclidean", as documented in ?dist).
mat <- dist(data, method = "euclidean")

hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)


# Distance method: manhattan (sum of absolute differences between the two
# vectors).
mat <- dist(data, method = "manhattan")

hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)


# Distance method: canberra (sum(|x_i - y_i| / (|x_i| + |y_i|))).
mat <- dist(data, method = "canberra")

hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)


# Distance method: minkowski (the p norm: the pth root of the sum of the pth
# powers of the differences of the components).
# dist() defaults to p = 2, which makes this identical to the Euclidean
# distance above; p is spelled out so that the choice is visible and easy to
# change.
mat <- dist(data, method = "minkowski", p = 2)

hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)
###############################################################################


##############################################################################
#A solution would be to use another clustering technique that does not suffer
#from the inversion issue.
###############################################################################
# All three fits below reuse whatever distance matrix `mat` currently holds.

# Average linkage.
hc_a <- hclust(mat, method = "average")
plot(hc_a, hang = -1)

# Single and complete linkage are monotonic.
hc_s <- hclust(mat, method = "single")
plot(hc_s, hang = -1)


hc_cm <- hclust(mat, method = "complete")
plot(hc_cm, hang = -1)
###############################################################################