# Switch to the author's tutorial folder only when it exists, so the script
# can also be sourced on other machines. Nothing in this section reads or
# writes files; the working directory only matters for the local setup.
tutorial_dir <- "C:\\Users\\authorized user\\Dropbox\\STAT445\\week7\\tutorials"
if (dir.exists(tutorial_dir)) {
  setwd(tutorial_dir)
}

###############################################################################
# Some data sets and hierarchical clustering methods, such as the centroid and
# median methods, can produce inversions.
# An inversion occurs when a new cluster merges with an existing cluster at a
# smaller distance than that of the previous merger (coalescence).
# This occurs due to the property of non-monotonicity, i.e. increasing
# similarity from iteration to iteration in the case of the centroid and
# median methods. In the single and complete linkage methods similarity is
# monotonically decreasing from iteration to iteration, so these methods do
# not suffer from the inversion issue.
###############################################################################

###############################################################################
# Example data set where changing the distance method fixes the inversion
###############################################################################

# 4 genes (rows) measured in 4 experiments (columns); values are filled row by
# row because byrow = TRUE.
data <- matrix(
  c(
    0.96, 0.07, 0.97, 0.98,
    0.50, 0.28, 0.29, 0.77,
    0.08, 0.96, 0.51, 0.51,
    0.14, 0.19, 0.41, 0.51
  ),
  ncol = 4,
  byrow = TRUE
)
colnames(data) <- c("Exp1", "Exp2", "Exp3", "Exp4")
rownames(data) <- c("Gene1", "Gene2", "Gene3", "Gene4")

# Distance method: maximum
mat <- dist(data, method = "maximum")
hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)

# Distance method: euclidean
mat <- dist(data, method = "euclidean")
hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)

###############################################################################
# But there exist data sets where the inversion cannot be fixed by changing
# the distance method!
# Below is such an example.
###############################################################################

# 4 genes (rows) measured in 6 experiments (columns); the 24 values are filled
# row by row because byrow = TRUE.
data <- matrix(
  c(
    0.96, 0.07, 0.97, 0.98, 0.99, 0.50,
    0.28, 0.29, 0.77, 0.78, 0.08, 0.96,
    0.51, 0.51, 0.55, 0.14, 0.19, 0.41,
    0.51, 0.40, 0.97, 0.98, 0.99, 0.50
  ),
  ncol = 6,
  byrow = TRUE
)
colnames(data) <- c("Exp1", "Exp2", "Exp3", "Exp4", "Exp5", "Exp6")
rownames(data) <- c("Gene1", "Gene2", "Gene3", "Gene4")

# Distance method: maximum
mat <- dist(data, method = "maximum")
hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)

# Distance method: euclidean
mat <- dist(data, method = "euclidean")
hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)

# Distance method: manhattan (absolute distance between the two vectors)
mat <- dist(data, method = "manhattan")
hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)

# Distance method: canberra (sum(|x_i - y_i| / |x_i + y_i|))
mat <- dist(data, method = "canberra")
hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)

# Distance method: minkowski (the p norm: the pth root of the sum of the pth
# powers of the differences of the components).
# No `p` is passed, so dist() uses its default power (p = 2 per the dist()
# documentation, which makes this run equivalent to the Euclidean one).
mat <- dist(data, method = "minkowski")
hc_c <- hclust(mat, method = "centroid")
plot(hc_c, hang = -1)
###############################################################################

###############################################################################
# A solution would be to use another clustering technique that does not
# suffer from the inversion issue.
# `mat` still holds the Minkowski distances computed just above.
###############################################################################
hc_a <- hclust(mat, method = "average")
plot(hc_a, hang = -1)

# Single and complete linkage are monotonic
hc_s <- hclust(mat, method = "single")
plot(hc_s, hang = -1)

hc_cm <- hclust(mat, method = "complete")
plot(hc_cm, hang = -1)
###############################################################################