FateZ Clustering

This notebook demonstrate how to implement clustering method with FateZ’s representing method

[5]:

print('This part is yet to be modified!')

import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from pkg_resources import resource_filename
from sklearn import cluster
import fatez.test as test
import fatez.model as model
# import scanpy as sc

print('Done Import')

This part is yet to be modified!
Done Import

Initialize testing model first.

[6]:

faker = test.Faker()
testM, _ = faker.test_full_model()
# model.Save(faker.test_gat(), '../data/ignore/gat.model')
# model.Save(testM, '../data/ignore/trainer.model')

Testing Full Model.

        Pre-Trainer Green.

        Fine-Tuner Green.

Edge Explain:
 tensor([[0.1573, 0.1524, 0.1506, 0.1357, 0.1349, 0.1365, 0.1361, 0.1353, 0.0000,
         0.1383],
        [0.1358, 0.1370, 0.1243, 0.1215, 0.1255, 0.1237, 0.1196, 0.1621, 0.0000,
         0.1627],
        [0.1638, 0.1475, 0.1416, 0.1444, 0.1399, 0.1464, 0.1395, 0.1368, 0.0000,
         0.1393],
        [0.1305, 0.1263, 0.1264, 0.1260, 0.1198, 0.1493, 0.1356, 0.1604, 0.0000,
         0.1371]])

Reg Explain:
 tensor([0., 0., 0., 0.], dtype=torch.float64)

Node Explain:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)

        Explainer Green.

Get the fake dataset

[8]:

dataset = faker.make_data_loader().dataset
for x in DataLoader(dataset, batch_size = len(dataset)):
    all_fea_mat = x[0]
    all_adj_mat = x[1]
print(f'Labels:\n{labels.tolist()}')

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <cell line: 2>:2                                                                              │
│                                                                                                  │
│   1 dataset = faker.make_data_loader().dataset                                                   │
│ ❱ 2 for x in DataLoader(dataset, batch_size = len(dataset)):                                     │
│   3 │   all_fea_mat = x[0]                                                                       │
│   4 │   all_adj_mat = x[1]                                                                       │
│   5 print(f'Labels:\n{labels.tolist()}')                                                         │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:634 in __next__           │
│                                                                                                  │
│    631 │   │   │   if self._sampler_iter is None:                                                │
│    632 │   │   │   │   # TODO(https://github.com/pytorch/pytorch/issues/76750)                   │
│    633 │   │   │   │   self._reset()  # type: ignore[call-arg]                                   │
│ ❱  634 │   │   │   data = self._next_data()                                                      │
│    635 │   │   │   self._num_yielded += 1                                                        │
│    636 │   │   │   if self._dataset_kind == _DatasetKind.Iterable and \                          │
│    637 │   │   │   │   │   self._IterableDataset_len_called is not None and \                    │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:678 in _next_data         │
│                                                                                                  │
│    675 │                                                                                         │
│    676 │   def _next_data(self):                                                                 │
│    677 │   │   index = self._next_index()  # may raise StopIteration                             │
│ ❱  678 │   │   data = self._dataset_fetcher.fetch(index)  # may raise StopIteration              │
│    679 │   │   if self._pin_memory:                                                              │
│    680 │   │   │   data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)            │
│    681 │   │   return data                                                                       │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py:54 in fetch             │
│                                                                                                  │
│   51 │   │   │   │   data = [self.dataset[idx] for idx in possibly_batched_index]                │
│   52 │   │   else:                                                                               │
│   53 │   │   │   data = self.dataset[possibly_batched_index]                                     │
│ ❱ 54 │   │   return self.collate_fn(data)                                                        │
│   55                                                                                             │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py:264 in                │
│ default_collate                                                                                  │
│                                                                                                  │
│   261 │   │   │   >>> default_collate_fn_map.update(CustoType, collate_customtype_fn)            │
│   262 │   │   │   >>> default_collate(batch)  # Handle `CustomType` automatically                │
│   263 │   """                                                                                    │
│ ❱ 264 │   return collate(batch, collate_fn_map=default_collate_fn_map)                           │
│   265                                                                                            │
│                                                                                                  │
│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py:150 in collate        │
│                                                                                                  │
│   147 │   │   │   │   # The sequence type may not support `__init__(iterable)` (e.g., `range`)   │
│   148 │   │   │   │   return [collate(samples, collate_fn_map=collate_fn_map) for samples in t   │
│   149 │                                                                                          │
│ ❱ 150 │   raise TypeError(default_collate_err_msg_format.format(elem_type))                      │
│   151                                                                                            │
│   152                                                                                            │
│   153 def collate_tensor_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ..   │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 
'torch_geometric.data.data.Data'>

Process origin data

[9]:

# Flatten Data
origin = np.array([torch.reshape(ele.to_dense(), (-1,)).tolist() for ele in all_fea_mat])

# PCA analysis for dimensionality deduction
pca_analysis = sc.pp.pca(origin, n_comps = 9, return_info = True,)
origin_pca = pca_analysis[0]
var_ratios = pca_analysis[2]
print(f'Origin Data Var Ratios:\n{var_ratios}')

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <cell line: 2>:2                                                                              │
│                                                                                                  │
│   1 # Flatten Data                                                                               │
│ ❱ 2 origin = np.array([torch.reshape(ele.to_dense(), (-1,)).tolist() for ele in all_fea_mat]     │
│   3                                                                                              │
│   4 # PCA analysis for dimensionality deduction                                                  │
│   5 pca_analysis = sc.pp.pca(origin, n_comps = 9, return_info = True,)                           │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
NameError: name 'all_fea_mat' is not defined

Process data with encoder

[10]:

# Get encoded representaions made by GAT -> BERT encoder
encode = np.array([
    torch.reshape(ele, (-1,)).tolist() for ele in testM.get_encoder_output(
        all_fea_mat, all_adj_mat
    )
])

# PCA analysis for dimensionality deduction
pca_analysis = sc.pp.pca(encode, n_comps = 9, return_info = True,)
encode_pca = pca_analysis[0]
var_ratios = pca_analysis[2]
print(f'Encoded Rep Var Ratios:\n{var_ratios}')

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <cell line: 2>:3                                                                              │
│                                                                                                  │
│    1 # Get encoded representaions made by GAT -> BERT encoder                                    │
│    2 encode = np.array([                                                                         │
│ ❱  3 │   torch.reshape(ele, (-1,)).tolist() for ele in testM.get_encoder_output(                 │
│    4 │   │   all_fea_mat, all_adj_mat                                                            │
│    5 │   )                                                                                       │
│    6 ])                                                                                          │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AttributeError: 'Trainer' object has no attribute 'get_encoder_output'

Set clustering models and fit models with original data

[11]:

eps = 0.5
n_clusters = len(np.unique(labels))
min_samples = 5

dbscan = cluster.DBSCAN(eps = eps)
kmeans = cluster.KMeans(n_clusters = n_clusters)
optics = cluster.OPTICS(min_samples = min_samples)

dbscan.fit(origin_pca)
kmeans.fit(origin_pca)
optics.fit(origin_pca)

# Get labels
print(dbscan.labels_.astype(int))
print(kmeans.labels_.astype(int))
print(optics.labels_.astype(int))

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <cell line: 2>:2                                                                              │
│                                                                                                  │
│    1 eps = 0.5                                                                                   │
│ ❱  2 n_clusters = len(np.unique(labels))                                                         │
│    3 min_samples = 5                                                                             │
│    4                                                                                             │
│    5 dbscan = cluster.DBSCAN(eps = eps)                                                          │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
NameError: name 'labels' is not defined

Reset models and fit with encoded representaions

[12]:

dbscan = cluster.DBSCAN(eps = eps)
kmeans = cluster.KMeans(n_clusters = n_clusters)
optics = cluster.OPTICS(min_samples = min_samples)
dbscan.fit(encode_pca)
kmeans.fit(encode_pca)
optics.fit(encode_pca)

# Get labels
print(dbscan.labels_.astype(int))
print(kmeans.labels_.astype(int))
print(optics.labels_.astype(int))

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ in <cell line: 2>:2                                                                              │
│                                                                                                  │
│    1 dbscan = cluster.DBSCAN(eps = eps)                                                          │
│ ❱  2 kmeans = cluster.KMeans(n_clusters = n_clusters)                                            │
│    3 optics = cluster.OPTICS(min_samples = min_samples)                                          │
│    4 dbscan.fit(encode_pca)                                                                      │
│    5 kmeans.fit(encode_pca)                                                                      │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
NameError: name 'n_clusters' is not defined