FateZ Clustering

This notebook demonstrates how to implement clustering methods with FateZ’s representation method.

[5]:
# Work-in-progress banner: printed first so readers know the cells below are unfinished.
print('This part is yet to be modified!')

import os

import numpy as np
import torch
from pkg_resources import resource_filename
from sklearn import cluster
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader

import fatez.test as test
import fatez.model as model
# import scanpy as sc

# Reached only if every import in this cell succeeded.
print('Done Import')
This part is yet to be modified!
Done Import

Initialize the testing model first.

[6]:
# Build the fatez test helper; later cells reuse it to obtain the fake dataset.
faker = test.Faker()
# test_full_model() is unpacked as a pair and only the first item is kept.
# Its captured output reports on the pre-trainer, fine-tuner and explainer.
# NOTE(review): a later error message in this notebook refers to testM as a
# 'Trainer' object — confirm the exact type against fatez.
testM, _ = faker.test_full_model()
# Optional persistence of the test models (left disabled).
# model.Save(faker.test_gat(), '../data/ignore/gat.model')
# model.Save(testM, '../data/ignore/trainer.model')
Testing Full Model.

        Pre-Trainer Green.

        Fine-Tuner Green.

Edge Explain:
 tensor([[0.1573, 0.1524, 0.1506, 0.1357, 0.1349, 0.1365, 0.1361, 0.1353, 0.0000,
         0.1383],
        [0.1358, 0.1370, 0.1243, 0.1215, 0.1255, 0.1237, 0.1196, 0.1621, 0.0000,
         0.1627],
        [0.1638, 0.1475, 0.1416, 0.1444, 0.1399, 0.1464, 0.1395, 0.1368, 0.0000,
         0.1393],
        [0.1305, 0.1263, 0.1264, 0.1260, 0.1198, 0.1493, 0.1356, 0.1604, 0.0000,
         0.1371]])

Reg Explain:
 tensor([0., 0., 0., 0.], dtype=torch.float64)

Node Explain:
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)

        Explainer Green.

Get the fake dataset

[8]:
# Re-wrap the faker's dataset in a loader whose batch size equals the dataset
# length, so the loop body sees every sample in a single batch.
# NOTE(review): the captured output of this cell shows torch's default_collate
# raising TypeError because the dataset yields torch_geometric Data objects.
# A collate function that understands them is required — TODO confirm the
# intended loader / collate_fn with fatez.
dataset = faker.make_data_loader().dataset
for x in DataLoader(dataset, batch_size = len(dataset)):
    # presumably x[0] holds the feature matrices and x[1] the adjacency
    # matrices — verify against the dataset's actual batch layout
    all_fea_mat = x[0]
    all_adj_mat = x[1]
# NOTE(review): `labels` is not assigned by any visible cell, so this line
# raises NameError on a fresh kernel — TODO extract the labels from the batch.
print(f'Labels:\n{labels.tolist()}')
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
 in <cell line: 2>:2                                                                              
                                                                                                  
   1 dataset = faker.make_data_loader().dataset                                                   
 2 for x in DataLoader(dataset, batch_size = len(dataset)):                                     
   3 │   all_fea_mat = x[0]                                                                       
   4 │   all_adj_mat = x[1]                                                                       
   5 print(f'Labels:\n{labels.tolist()}')                                                         
                                                                                                  
 /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:634 in __next__           
                                                                                                  
    631 │   │   │   if self._sampler_iter is None:                                                
    632 │   │   │   │   # TODO(https://github.com/pytorch/pytorch/issues/76750)                   
    633 │   │   │   │   self._reset()  # type: ignore[call-arg]                                   
  634 │   │   │   data = self._next_data()                                                      
    635 │   │   │   self._num_yielded += 1                                                        
    636 │   │   │   if self._dataset_kind == _DatasetKind.Iterable and \                          
    637 │   │   │   │   │   self._IterableDataset_len_called is not None and \                    
                                                                                                  
 /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:678 in _next_data         
                                                                                                  
    675 │                                                                                         
    676 │   def _next_data(self):                                                                 
    677 │   │   index = self._next_index()  # may raise StopIteration                             
  678 │   │   data = self._dataset_fetcher.fetch(index)  # may raise StopIteration              
    679 │   │   if self._pin_memory:                                                              
    680 │   │   │   data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)            
    681 │   │   return data                                                                       
                                                                                                  
 /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py:54 in fetch             
                                                                                                  
   51 │   │   │   │   data = [self.dataset[idx] for idx in possibly_batched_index]                
   52 │   │   else:                                                                               
   53 │   │   │   data = self.dataset[possibly_batched_index]                                     
 54 │   │   return self.collate_fn(data)                                                        
   55                                                                                             
                                                                                                  
 /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py:264 in                
 default_collate                                                                                  
                                                                                                  
   261 │   │   │   >>> default_collate_fn_map.update(CustoType, collate_customtype_fn)            
   262 │   │   │   >>> default_collate(batch)  # Handle `CustomType` automatically                
   263 """                                                                                    
 264 return collate(batch, collate_fn_map=default_collate_fn_map)                           
   265                                                                                            
                                                                                                  
 /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py:150 in collate        
                                                                                                  
   147 │   │   │   │   # The sequence type may not support `__init__(iterable)` (e.g., `range`)   
   148 │   │   │   │   return [collate(samples, collate_fn_map=collate_fn_map) for samples in t   
   149 │                                                                                          
 150 raise TypeError(default_collate_err_msg_format.format(elem_type))                      
   151                                                                                            
   152                                                                                            
   153 def collate_tensor_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, ..   
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 
'torch_geometric.data.data.Data'>

Process the original data

[9]:
# Flatten each sample's feature matrix into a single row so that the samples
# stack into a 2-D (sample x flattened-feature) array.
# NOTE(review): assumes every element of all_fea_mat supports .to_dense()
# — TODO confirm against what the data-loading cell actually produces.
origin = np.array([
    torch.reshape(ele.to_dense(), (-1,)).tolist() for ele in all_fea_mat
])

# PCA for dimensionality reduction.
# scikit-learn's PCA is used because scanpy (`sc`) is not imported in this
# notebook, so calling `sc.pp.pca` would raise NameError.
# n_components = 9 requires at least 9 samples and 9 flattened features.
origin_pca_model = PCA(n_components = 9, random_state = 0)
origin_pca = origin_pca_model.fit_transform(origin)
var_ratios = origin_pca_model.explained_variance_ratio_
print(f'Origin Data Var Ratios:\n{var_ratios}')

╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
 in <cell line: 2>:2                                                                              
                                                                                                  
   1 # Flatten Data                                                                               
 2 origin = np.array([torch.reshape(ele.to_dense(), (-1,)).tolist() for ele in all_fea_mat]     
   3                                                                                              
   4 # PCA analysis for dimensionality deduction                                                  
   5 pca_analysis = sc.pp.pca(origin, n_comps = 9, return_info = True,)                           
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
NameError: name 'all_fea_mat' is not defined

Process data with encoder

[10]:
# Get encoded representations made by the GAT -> BERT encoder and flatten each
# one into a single row.
# NOTE(review): the captured output of this cell shows
# "AttributeError: 'Trainer' object has no attribute 'get_encoder_output'"
# — TODO replace this call with the current fatez API for encoder outputs.
encode = np.array([
    torch.reshape(ele, (-1,)).tolist() for ele in testM.get_encoder_output(
        all_fea_mat, all_adj_mat
    )
])

# PCA for dimensionality reduction.
# scikit-learn's PCA is used because scanpy (`sc`) is not imported in this
# notebook, so calling `sc.pp.pca` would raise NameError.
# n_components = 9 requires at least 9 samples and 9 flattened features.
encode_pca_model = PCA(n_components = 9, random_state = 0)
encode_pca = encode_pca_model.fit_transform(encode)
var_ratios = encode_pca_model.explained_variance_ratio_
print(f'Encoded Rep Var Ratios:\n{var_ratios}')
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
 in <cell line: 2>:3                                                                              
                                                                                                  
    1 # Get encoded representaions made by GAT -> BERT encoder                                    
    2 encode = np.array([                                                                         
  3 torch.reshape(ele, (-1,)).tolist() for ele in testM.get_encoder_output(                 
    4 │   │   all_fea_mat, all_adj_mat                                                            
    5 │   )                                                                                       
    6 ])                                                                                          
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
AttributeError: 'Trainer' object has no attribute 'get_encoder_output'

Set up the clustering models and fit them with the original data

[11]:
# Clustering hyper-parameters; the encoded-representation cell reuses them.
eps = 0.5            # passed to DBSCAN
# One KMeans cluster per distinct label.
# NOTE(review): `labels` is not assigned by any visible cell; this line raises
# NameError until the data-loading cell provides it.
n_clusters = len(np.unique(labels))
min_samples = 5      # passed to OPTICS

dbscan = cluster.DBSCAN(eps = eps)
# KMeans starts from random centroids; pin the seed so that a fresh
# Restart & Run All reproduces the same cluster labels.
kmeans = cluster.KMeans(n_clusters = n_clusters, random_state = 0)
optics = cluster.OPTICS(min_samples = min_samples)

# Fit every model on the PCA-reduced original data
dbscan.fit(origin_pca)
kmeans.fit(origin_pca)
optics.fit(origin_pca)

# Get labels (printed in order: DBSCAN, KMeans, OPTICS)
print(dbscan.labels_.astype(int))
print(kmeans.labels_.astype(int))
print(optics.labels_.astype(int))
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
 in <cell line: 2>:2                                                                              
                                                                                                  
    1 eps = 0.5                                                                                   
  2 n_clusters = len(np.unique(labels))                                                         
    3 min_samples = 5                                                                             
    4                                                                                             
    5 dbscan = cluster.DBSCAN(eps = eps)                                                          
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
NameError: name 'labels' is not defined

Reset the models and fit them with the encoded representations

[12]:
# Fresh, unfitted models built from eps, n_clusters and min_samples, which
# the preceding clustering cell defines.
dbscan = cluster.DBSCAN(eps = eps)
# KMeans starts from random centroids; pin the seed so that a fresh
# Restart & Run All reproduces the same cluster labels.
kmeans = cluster.KMeans(n_clusters = n_clusters, random_state = 0)
optics = cluster.OPTICS(min_samples = min_samples)

# Fit every model on the PCA-reduced encoded representations
dbscan.fit(encode_pca)
kmeans.fit(encode_pca)
optics.fit(encode_pca)

# Get labels (printed in order: DBSCAN, KMeans, OPTICS)
print(dbscan.labels_.astype(int))
print(kmeans.labels_.astype(int))
print(optics.labels_.astype(int))
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
 in <cell line: 2>:2                                                                              
                                                                                                  
    1 dbscan = cluster.DBSCAN(eps = eps)                                                          
  2 kmeans = cluster.KMeans(n_clusters = n_clusters)                                            
    3 optics = cluster.OPTICS(min_samples = min_samples)                                          
    4 dbscan.fit(encode_pca)                                                                      
    5 kmeans.fit(encode_pca)                                                                      
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
NameError: name 'n_clusters' is not defined