FateZ Clustering
This notebook demonstrate how to implement clustering method with FateZ’s representing method
[5]:
print('This part is yet to be modified!')
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from pkg_resources import resource_filename
from sklearn import cluster
import fatez.test as test
import fatez.model as model
# import scanpy as sc
print('Done Import')
This part is yet to be modified!
Done Import
Initialize testing model first.
[6]:
faker = test.Faker()
testM, _ = faker.test_full_model()
# model.Save(faker.test_gat(), '../data/ignore/gat.model')
# model.Save(testM, '../data/ignore/trainer.model')
Testing Full Model.
Pre-Trainer Green.
Fine-Tuner Green.
Edge Explain:
tensor([[0.1573, 0.1524, 0.1506, 0.1357, 0.1349, 0.1365, 0.1361, 0.1353, 0.0000,
0.1383],
[0.1358, 0.1370, 0.1243, 0.1215, 0.1255, 0.1237, 0.1196, 0.1621, 0.0000,
0.1627],
[0.1638, 0.1475, 0.1416, 0.1444, 0.1399, 0.1464, 0.1395, 0.1368, 0.0000,
0.1393],
[0.1305, 0.1263, 0.1264, 0.1260, 0.1198, 0.1493, 0.1356, 0.1604, 0.0000,
0.1371]])
Reg Explain:
tensor([0., 0., 0., 0.], dtype=torch.float64)
Node Explain:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)
Explainer Green.
Get the fake dataset
[8]:
dataset = faker.make_data_loader().dataset
for x in DataLoader(dataset, batch_size = len(dataset)):
all_fea_mat = x[0]
all_adj_mat = x[1]
print(f'Labels:\n{labels.tolist()}')
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ in <cell line: 2>:2 │ │ │ │ 1 dataset = faker.make_data_loader().dataset │ │ ❱ 2 for x in DataLoader(dataset, batch_size = len(dataset)): │ │ 3 │ all_fea_mat = x[0] │ │ 4 │ all_adj_mat = x[1] │ │ 5 print(f'Labels:\n{labels.tolist()}') │ │ │ │ /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:634 in __next__ │ │ │ │ 631 │ │ │ if self._sampler_iter is None: │ │ 632 │ │ │ │ # TODO(https://github.com/pytorch/pytorch/issues/76750) │ │ 633 │ │ │ │ self._reset() # type: ignore[call-arg] │ │ ❱ 634 │ │ │ data = self._next_data() │ │ 635 │ │ │ self._num_yielded += 1 │ │ 636 │ │ │ if self._dataset_kind == _DatasetKind.Iterable and \ │ │ 637 │ │ │ │ │ self._IterableDataset_len_called is not None and \ │ │ │ │ /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:678 in _next_data │ │ │ │ 675 │ │ │ 676 │ def _next_data(self): │ │ 677 │ │ index = self._next_index() # may raise StopIteration │ │ ❱ 678 │ │ data = self._dataset_fetcher.fetch(index) # may raise StopIteration │ │ 679 │ │ if self._pin_memory: │ │ 680 │ │ │ data = _utils.pin_memory.pin_memory(data, self._pin_memory_device) │ │ 681 │ │ return data │ │ │ │ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py:54 in fetch │ │ │ │ 51 │ │ │ │ data = [self.dataset[idx] for idx in possibly_batched_index] │ │ 52 │ │ else: │ │ 53 │ │ │ data = self.dataset[possibly_batched_index] │ │ ❱ 54 │ │ return self.collate_fn(data) │ │ 55 │ │ │ │ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py:264 in │ │ default_collate │ │ │ │ 261 │ │ │ >>> default_collate_fn_map.update(CustoType, collate_customtype_fn) │ │ 262 │ │ │ >>> default_collate(batch) # Handle `CustomType` automatically │ │ 263 │ """ │ │ ❱ 264 │ return collate(batch, collate_fn_map=default_collate_fn_map) │ │ 265 │ │ │ │ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py:150 in collate │ │ │ │ 147 │ │ │ │ # The sequence type may not support `__init__(iterable)` (e.g., `range`) │ │ 148 │ │ │ │ return [collate(samples, collate_fn_map=collate_fn_map) for samples in t │ │ 149 │ │ │ ❱ 150 │ raise TypeError(default_collate_err_msg_format.format(elem_type)) │ │ 151 │ │ 152 │ │ 153 def collate_tensor_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, .. │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'torch_geometric.data.data.Data'>
Process origin data
[9]:
# Flatten Data
origin = np.array([torch.reshape(ele.to_dense(), (-1,)).tolist() for ele in all_fea_mat])
# PCA analysis for dimensionality deduction
pca_analysis = sc.pp.pca(origin, n_comps = 9, return_info = True,)
origin_pca = pca_analysis[0]
var_ratios = pca_analysis[2]
print(f'Origin Data Var Ratios:\n{var_ratios}')
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ in <cell line: 2>:2 │ │ │ │ 1 # Flatten Data │ │ ❱ 2 origin = np.array([torch.reshape(ele.to_dense(), (-1,)).tolist() for ele in all_fea_mat] │ │ 3 │ │ 4 # PCA analysis for dimensionality deduction │ │ 5 pca_analysis = sc.pp.pca(origin, n_comps = 9, return_info = True,) │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ NameError: name 'all_fea_mat' is not defined
Process data with encoder
[10]:
# Get encoded representaions made by GAT -> BERT encoder
encode = np.array([
torch.reshape(ele, (-1,)).tolist() for ele in testM.get_encoder_output(
all_fea_mat, all_adj_mat
)
])
# PCA analysis for dimensionality deduction
pca_analysis = sc.pp.pca(encode, n_comps = 9, return_info = True,)
encode_pca = pca_analysis[0]
var_ratios = pca_analysis[2]
print(f'Encoded Rep Var Ratios:\n{var_ratios}')
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ in <cell line: 2>:3 │ │ │ │ 1 # Get encoded representaions made by GAT -> BERT encoder │ │ 2 encode = np.array([ │ │ ❱ 3 │ torch.reshape(ele, (-1,)).tolist() for ele in testM.get_encoder_output( │ │ 4 │ │ all_fea_mat, all_adj_mat │ │ 5 │ ) │ │ 6 ]) │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ AttributeError: 'Trainer' object has no attribute 'get_encoder_output'
Set clustering models and fit models with original data
[11]:
eps = 0.5
n_clusters = len(np.unique(labels))
min_samples = 5
dbscan = cluster.DBSCAN(eps = eps)
kmeans = cluster.KMeans(n_clusters = n_clusters)
optics = cluster.OPTICS(min_samples = min_samples)
dbscan.fit(origin_pca)
kmeans.fit(origin_pca)
optics.fit(origin_pca)
# Get labels
print(dbscan.labels_.astype(int))
print(kmeans.labels_.astype(int))
print(optics.labels_.astype(int))
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ in <cell line: 2>:2 │ │ │ │ 1 eps = 0.5 │ │ ❱ 2 n_clusters = len(np.unique(labels)) │ │ 3 min_samples = 5 │ │ 4 │ │ 5 dbscan = cluster.DBSCAN(eps = eps) │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ NameError: name 'labels' is not defined
Reset models and fit with encoded representaions
[12]:
dbscan = cluster.DBSCAN(eps = eps)
kmeans = cluster.KMeans(n_clusters = n_clusters)
optics = cluster.OPTICS(min_samples = min_samples)
dbscan.fit(encode_pca)
kmeans.fit(encode_pca)
optics.fit(encode_pca)
# Get labels
print(dbscan.labels_.astype(int))
print(kmeans.labels_.astype(int))
print(optics.labels_.astype(int))
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ in <cell line: 2>:2 │ │ │ │ 1 dbscan = cluster.DBSCAN(eps = eps) │ │ ❱ 2 kmeans = cluster.KMeans(n_clusters = n_clusters) │ │ 3 optics = cluster.OPTICS(min_samples = min_samples) │ │ 4 dbscan.fit(encode_pca) │ │ 5 kmeans.fit(encode_pca) │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ NameError: name 'n_clusters' is not defined