|
| 1 | +import warnings |
| 2 | +import pandas as pd |
| 3 | +from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree |
| 4 | + |
| 5 | + |
def fast_cluster(array, method, metric):
    """Compute a linkage for ``array`` with the ``fastcluster`` package.

    ``fastcluster.linkage_vector`` is called when ``method`` is 'single',
    or when ``metric`` is 'euclidean' and ``method`` is one of 'centroid',
    'median' or 'ward'. Every other combination is handed to
    ``fastcluster.linkage``.

    Parameters
    ----------
    array : array-like
        Observations, forwarded untouched to fastcluster.
    method : str
        Linkage method name.
    metric : str
        Distance metric name.

    Returns
    -------
    Whatever the selected fastcluster routine returns.

    Raises
    ------
    ImportError
        If ``fastcluster`` is not installed; the import is deliberately
        local so callers can catch this and fall back.
    """
    import fastcluster

    vector_ok = (
        (metric == 'euclidean' and method in ('centroid', 'median', 'ward'))
        or method == 'single'
    )
    backend = fastcluster.linkage_vector if vector_ok else fastcluster.linkage
    return backend(array, method=method, metric=metric)
| 18 | + |
| 19 | + |
def make_linkage(array, method, metric):
    """Compute a linkage, preferring ``fastcluster`` and falling back to scipy.

    ``fast_cluster`` is tried first. If it raises ``ImportError`` the
    linkage is computed with ``scipy.cluster.hierarchy.linkage`` instead,
    and a warning is issued beforehand when ``len(array) >= 10000``.

    Parameters
    ----------
    array : array-like
        Observations to cluster.
    method : str
        Linkage method name.
    metric : str
        Distance metric name.

    Returns
    -------
    The linkage produced by whichever backend ran.
    """
    try:
        return fast_cluster(array, method, metric)
    except ImportError:
        # Only nag about the optional dependency when the input is large.
        if not len(array) < 10000:
            warnings.warn("Clustering large matrix with scipy. Installing "
                          "`fastcluster` may give better performance.")

    return linkage(array, method=method, metric=metric)
| 31 | + |
| 32 | + |
def cluster_hierarchy(data, method, axis, metric='euclidean', n_clusters=None):
    """Reorder the labels of one axis of ``data`` by hierarchical clustering.

    Parameters
    ----------
    data : pandas.DataFrame
        Rectangular data. The caller's frame is not modified; a copy is
        taken before anything else happens.
    method : str
        Linkage method forwarded to ``make_linkage``, e.g. 'single',
        'centroid', 'median' or 'ward'.
    axis : int
        Which axis to cluster. 1 clusters the columns (the frame is
        transposed first); any other value clusters the rows.
    metric : str, optional
        Distance metric forwarded to ``make_linkage``. Defaults to
        'euclidean'.
    n_clusters : int, optional
        When given, the tree is cut into this many flat clusters and the
        labels are returned grouped by ascending cluster id. When omitted,
        the labels are returned in dendrogram leaf order.

    Returns
    -------
    numpy.ndarray
        The index labels of the clustered axis, reordered.
    """
    frame = data.copy()
    if axis == 1:
        frame = frame.T
    labels = frame.index.values
    tree = make_linkage(frame.values, method, metric)

    if n_clusters is None:
        # no_plot=True: only the computed leaf order is needed, no rendering.
        leaves = dendrogram(tree, no_plot=True)['leaves']
        return labels[leaves]

    # Cut the linkage matrix computed above. The previous code passed the
    # imported scipy `linkage` function here, so this branch always failed.
    membership = pd.Series(cut_tree(tree, n_clusters=n_clusters).flatten())
    # A stable sort keeps members of one cluster in their original relative
    # order, which makes the result deterministic.
    order = membership.sort_values(ascending=True, kind='mergesort').index.values
    return labels[order]
0 commit comments