{ "cells": [ { "cell_type": "markdown", "id": "b69b2ab7", "metadata": {}, "source": [ "# FateZ Clustering \n", "\n", "This notebook demonstrates how to apply clustering methods using FateZ's representation method." ] }, { "cell_type": "code", "execution_count": 5, "id": "dd050393", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "This part is yet to be modified!\n", "Done Import\n" ] } ], "source": [ "print('This part is yet to be modified!')\n", "\n", "import os\n", "import torch\n", "import numpy as np\n", "from torch.utils.data import DataLoader\n", "from pkg_resources import resource_filename\n", "from sklearn import cluster\n", "import fatez.test as test\n", "import fatez.model as model\n", "# import scanpy as sc\n", "\n", "print('Done Import')" ] }, { "cell_type": "markdown", "id": "10210c8d", "metadata": {}, "source": [ "### Initialize testing model first." ] }, { "cell_type": "code", "execution_count": 6, "id": "3948e182", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Testing Full Model.\n", "\n", "\tPre-Trainer Green.\n", "\n", "\tFine-Tuner Green.\n", "\n", "Edge Explain:\n", " tensor([[0.1573, 0.1524, 0.1506, 0.1357, 0.1349, 0.1365, 0.1361, 0.1353, 0.0000,\n", " 0.1383],\n", " [0.1358, 0.1370, 0.1243, 0.1215, 0.1255, 0.1237, 0.1196, 0.1621, 0.0000,\n", " 0.1627],\n", " [0.1638, 0.1475, 0.1416, 0.1444, 0.1399, 0.1464, 0.1395, 0.1368, 0.0000,\n", " 0.1393],\n", " [0.1305, 0.1263, 0.1264, 0.1260, 0.1198, 0.1493, 0.1356, 0.1604, 0.0000,\n", " 0.1371]]) \n", "\n", "Reg Explain:\n", " tensor([0., 0., 0., 0.], dtype=torch.float64) \n", "\n", "Node Explain:\n", " tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64) \n", "\n", "\tExplainer Green.\n", "\n" ] } ], "source": [ "faker = test.Faker()\n", "testM, _ = faker.test_full_model()\n", "# model.Save(faker.test_gat(), '../data/ignore/gat.model')\n", "# model.Save(testM, '../data/ignore/trainer.model')" ] }, { "cell_type": 
"markdown", "id": "255aa807", "metadata": {}, "source": [ "### Get the fake dataset" ] }, { "cell_type": "code", "execution_count": 8, "id": "82ce2fb9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ in <cell line: 2>:2 │\n", "│ │\n", "│ 1 dataset = faker.make_data_loader().dataset │\n", "│ ❱ 2 for x in DataLoader(dataset, batch_size = len(dataset)): │\n", "│ 3 │ all_fea_mat = x[0] │\n", "│ 4 │ all_adj_mat = x[1] │\n", "│ 5 print(f'Labels:\\n{labels.tolist()}') │\n", "│ │\n", "│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:634 in __next__ │\n", "│ │\n", "│ 631 │ │ │ if self._sampler_iter is None: │\n", "│ 632 │ │ │ │ # TODO(https://github.com/pytorch/pytorch/issues/76750) │\n", "│ 633 │ │ │ │ self._reset() # type: ignore[call-arg] │\n", "│ ❱ 634 │ │ │ data = self._next_data() │\n", "│ 635 │ │ │ self._num_yielded += 1 │\n", "│ 636 │ │ │ if self._dataset_kind == _DatasetKind.Iterable and \\ │\n", "│ 637 │ │ │ │ │ self._IterableDataset_len_called is not None and \\ │\n", "│ │\n", "│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py:678 in _next_data │\n", "│ │\n", "│ 675 │ │\n", "│ 676 │ def _next_data(self): │\n", "│ 677 │ │ index = self._next_index() # may raise StopIteration │\n", "│ ❱ 678 │ │ data = self._dataset_fetcher.fetch(index) # may raise StopIteration │\n", "│ 679 │ │ if self._pin_memory: │\n", "│ 680 │ │ │ data = _utils.pin_memory.pin_memory(data, self._pin_memory_device) │\n", "│ 681 │ │ return data │\n", "│ │\n", "│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py:54 in fetch │\n", "│ │\n", "│ 51 │ │ │ │ data = [self.dataset[idx] for idx in possibly_batched_index] │\n", "│ 52 │ │ else: │\n", "│ 53 │ │ │ data = self.dataset[possibly_batched_index] │\n", "│ ❱ 54 │ │ return self.collate_fn(data) │\n", "│ 55 │\n", "│ │\n", "│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py:264 in │\n", "│ default_collate │\n", "│ │\n", "│ 261 │ │ │ >>> default_collate_fn_map.update(CustoType, collate_customtype_fn) │\n", "│ 262 │ │ │ >>> default_collate(batch) # 
Handle `CustomType` automatically │\n", "│ 263 │ \"\"\" │\n", "│ ❱ 264 │ return collate(batch, collate_fn_map=default_collate_fn_map) │\n", "│ 265 │\n", "│ │\n", "│ /usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/collate.py:150 in collate │\n", "│ │\n", "│ 147 │ │ │ │ # The sequence type may not support `__init__(iterable)` (e.g., `range`) │\n", "│ 148 │ │ │ │ return [collate(samples, collate_fn_map=collate_fn_map) for samples in t │\n", "│ 149 │ │\n", "│ ❱ 150 │ raise TypeError(default_collate_err_msg_format.format(elem_type)) │\n", "│ 151 │\n", "│ 152 │\n", "│ 153 def collate_tensor_fn(batch, *, collate_fn_map: Optional[Dict[Union[Type, Tuple[Type, .. │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class \n", "'torch_geometric.data.data.Data'>\n", "\n" ], "text/plain": [ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m in \u001b[92m
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ in <cell line: 2>:2 │\n", "│ │\n", "│ 1 # Flatten Data │\n", "│ ❱ 2 origin = np.array([torch.reshape(ele.to_dense(), (-1,)).tolist() for ele in all_fea_mat] │\n", "│ 3 │\n", "│ 4 # PCA analysis for dimensionality deduction │\n", "│ 5 pca_analysis = sc.pp.pca(origin, n_comps = 9, return_info = True,) │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "NameError: name 'all_fea_mat' is not defined\n", "\n" ], "text/plain": [ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m in \u001b[92m
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ in <cell line: 2>:3 │\n", "│ │\n", "│ 1 # Get encoded representaions made by GAT -> BERT encoder │\n", "│ 2 encode = np.array([ │\n", "│ ❱ 3 │ torch.reshape(ele, (-1,)).tolist() for ele in testM.get_encoder_output( │\n", "│ 4 │ │ all_fea_mat, all_adj_mat │\n", "│ 5 │ ) │\n", "│ 6 ]) │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "AttributeError: 'Trainer' object has no attribute 'get_encoder_output'\n", "\n" ], "text/plain": [ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m in \u001b[92m
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ in <cell line: 2>:2 │\n", "│ │\n", "│ 1 eps = 0.5 │\n", "│ ❱ 2 n_clusters = len(np.unique(labels)) │\n", "│ 3 min_samples = 5 │\n", "│ 4 │\n", "│ 5 dbscan = cluster.DBSCAN(eps = eps) │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "NameError: name 'labels' is not defined\n", "\n" ], "text/plain": [ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m in \u001b[92m
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n", "│ in <cell line: 2>:2 │\n", "│ │\n", "│ 1 dbscan = cluster.DBSCAN(eps = eps) │\n", "│ ❱ 2 kmeans = cluster.KMeans(n_clusters = n_clusters) │\n", "│ 3 optics = cluster.OPTICS(min_samples = min_samples) │\n", "│ 4 dbscan.fit(encode_pca) │\n", "│ 5 kmeans.fit(encode_pca) │\n", "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n", "NameError: name 'n_clusters' is not defined\n", "\n" ], "text/plain": [ "\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n", "\u001b[31m│\u001b[0m in \u001b[92m