Skip to content

Commit d6ae3f3

Browse files
committed
add datasets and fix Example.ipynb
1 parent a984113 commit d6ae3f3

9 files changed

Lines changed: 1605 additions & 1355 deletions

File tree

.DS_Store

0 Bytes
Binary file not shown.

Example.ipynb

Lines changed: 115 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
},
1010
{
1111
"cell_type": "code",
12-
"execution_count": 3,
12+
"execution_count": 18,
1313
"metadata": {},
1414
"outputs": [
1515
{
@@ -40,15 +40,27 @@
4040
" load_method(mid)"
4141
]
4242
},
43+
{
44+
"cell_type": "markdown",
45+
"metadata": {},
46+
"source": [
47+
"These are the method_id for the existing datasets."
48+
]
49+
},
4350
{
4451
"cell_type": "code",
45-
"execution_count": 4,
52+
"execution_count": 19,
4653
"metadata": {},
4754
"outputs": [
4855
{
4956
"name": "stdout",
5057
"output_type": "stream",
5158
"text": [
59+
"BlogCatalog\n",
60+
"ICEWS\n",
61+
"Facebook\n",
62+
"DD6\n",
63+
"PPI\n",
5264
"airports\n"
5365
]
5466
}
@@ -61,6 +73,57 @@
6173
" load_dataset(did)"
6274
]
6375
},
76+
{
77+
"cell_type": "markdown",
78+
"metadata": {},
79+
"source": [
80+
"These are the dataset_id for the existing datasets."
81+
]
82+
},
83+
{
84+
"cell_type": "markdown",
85+
"metadata": {},
86+
"source": [
87+
"## Load Dataset"
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 20,
93+
"metadata": {},
94+
"outputs": [],
95+
"source": [
96+
"# Get airports datasets\n",
97+
"DataProvider = load_dataset(\"airports\")\n",
98+
"Datasets = DataProvider().get_datasets()\n",
99+
"dataset_graph = DataProvider().load_dataset(Datasets[0])"
100+
]
101+
},
102+
{
103+
"cell_type": "markdown",
104+
"metadata": {},
105+
"source": [
106+
"Note that there are three datasets in the airports dataset.\n",
107+
"\n",
108+
"Datasets\\[0\\] represents the BR-air traffic Dataset\n",
109+
"\n",
110+
"Datasets\\[1\\] represents the EU-air traffic Dataset\n",
111+
"\n",
112+
"Datasets\\[2\\] represents the US-air traffic Dataset"
113+
]
114+
},
115+
{
116+
"cell_type": "code",
117+
"execution_count": 21,
118+
"metadata": {},
119+
"outputs": [],
120+
"source": [
121+
"# Example code for getting the other datasets\n",
122+
"DataProvider = load_dataset(\"Facebook\")\n",
123+
"Facebook_dataset = DataProvider().get_datasets()\n",
124+
"Facebook_graph = DataProvider().load_dataset(Facebook_dataset[0])"
125+
]
126+
},
64127
{
65128
"cell_type": "markdown",
66129
"metadata": {},
@@ -85,13 +148,8 @@
85148
"# Define a hyper-class to load the embedding method\n",
86149
"EmbMethodClass = load_method(\"struc2vec\")\n",
87150
"\n",
88-
"# Get airports datasets\n",
89-
"AirportDataProvider = load_dataset(\"airports\")\n",
90-
"airport_datasets = AirportDataProvider().get_datasets()\n",
91-
"brazil_airport_graph = AirportDataProvider().load_dataset(airport_datasets[0])\n",
92-
"\n",
93151
"# Call the embedding method with the graph for initialization\n",
94-
"struc2vec = EmbMethodClass(brazil_airport_graph, \n",
152+
"struc2vec = EmbMethodClass(dataset_graph, \n",
95153
" num_walks=10, \n",
96154
" walk_length=80, \n",
97155
" window_size=10, \n",
@@ -106,7 +164,7 @@
106164
},
107165
{
108166
"cell_type": "code",
109-
"execution_count": 27,
167+
"execution_count": 29,
110168
"metadata": {},
111169
"outputs": [
112170
{
@@ -126,13 +184,14 @@
126184
" 'opt3': False}"
127185
]
128186
},
129-
"execution_count": 27,
187+
"execution_count": 29,
130188
"metadata": {},
131189
"output_type": "execute_result"
132190
}
133191
],
134192
"source": [
135-
"# This shows the tunable parameters for the certain embedding method\n",
193+
"# This shows the tunable hyper-parameters for the certain embedding method\n",
194+
"# Here, for example, list the tunable hyper-parameters for struc2vec\n",
136195
"EmbMethodClass.__PARAMS__"
137196
]
138197
},
@@ -145,7 +204,7 @@
145204
},
146205
{
147206
"cell_type": "code",
148-
"execution_count": 28,
207+
"execution_count": 24,
149208
"metadata": {},
150209
"outputs": [],
151210
"source": [
@@ -163,7 +222,7 @@
163222
},
164223
{
165224
"cell_type": "code",
166-
"execution_count": 29,
225+
"execution_count": 25,
167226
"metadata": {},
168227
"outputs": [
169228
{
@@ -185,45 +244,45 @@
185244
},
186245
{
187246
"cell_type": "code",
188-
"execution_count": 31,
247+
"execution_count": 26,
189248
"metadata": {},
190249
"outputs": [
191250
{
192251
"data": {
193252
"text/plain": [
194-
"{'overall': {'accuracy': {'mean': 0.7633, 'std': 0.0787},\n",
195-
" 'f1_macro': {'mean': 0.7548, 'std': 0.0765},\n",
196-
" 'f1_micro': {'mean': 0.7633, 'std': 0.0787},\n",
197-
" 'auc_micro': {'mean': 0.9182, 'std': 0.0327},\n",
198-
" 'auc_macro': {'mean': 0.9224, 'std': 0.0301}},\n",
199-
" 'detailed': {0: {'accuracy': 0.7778,\n",
200-
" 'f1_macro': 0.7515,\n",
201-
" 'f1_micro': 0.7778,\n",
202-
" 'auc_micro': 0.9204,\n",
203-
" 'auc_macro': 0.9298},\n",
204-
" 1: {'accuracy': 0.6154,\n",
205-
" 'f1_macro': 0.6209,\n",
206-
" 'f1_micro': 0.6154,\n",
207-
" 'auc_micro': 0.858,\n",
208-
" 'auc_macro': 0.866},\n",
253+
"{'overall': {'accuracy': {'mean': 0.786, 'std': 0.0759},\n",
254+
" 'f1_macro': {'mean': 0.7791, 'std': 0.0752},\n",
255+
" 'f1_micro': {'mean': 0.786, 'std': 0.0759},\n",
256+
" 'auc_micro': {'mean': 0.9288, 'std': 0.0255},\n",
257+
" 'auc_macro': {'mean': 0.9413, 'std': 0.0182}},\n",
258+
" 'detailed': {0: {'accuracy': 0.8148,\n",
259+
" 'f1_macro': 0.805,\n",
260+
" 'f1_micro': 0.8148,\n",
261+
" 'auc_micro': 0.9374,\n",
262+
" 'auc_macro': 0.9418},\n",
263+
" 1: {'accuracy': 0.6538,\n",
264+
" 'f1_macro': 0.6542,\n",
265+
" 'f1_micro': 0.6538,\n",
266+
" 'auc_micro': 0.8817,\n",
267+
" 'auc_macro': 0.9083},\n",
209268
" 2: {'accuracy': 0.7692,\n",
210269
" 'f1_macro': 0.7448,\n",
211270
" 'f1_micro': 0.7692,\n",
212-
" 'auc_micro': 0.9413,\n",
213-
" 'auc_macro': 0.926},\n",
214-
" 3: {'accuracy': 0.8462,\n",
215-
" 'f1_macro': 0.8421,\n",
216-
" 'f1_micro': 0.8462,\n",
217-
" 'auc_micro': 0.9527,\n",
218-
" 'auc_macro': 0.9561},\n",
271+
" 'auc_micro': 0.9438,\n",
272+
" 'auc_macro': 0.9578},\n",
273+
" 3: {'accuracy': 0.8846,\n",
274+
" 'f1_macro': 0.8769,\n",
275+
" 'f1_micro': 0.8846,\n",
276+
" 'auc_micro': 0.9556,\n",
277+
" 'auc_macro': 0.9585},\n",
219278
" 4: {'accuracy': 0.8077,\n",
220279
" 'f1_macro': 0.8148,\n",
221280
" 'f1_micro': 0.8077,\n",
222-
" 'auc_micro': 0.9186,\n",
223-
" 'auc_macro': 0.9339}}}"
281+
" 'auc_micro': 0.9255,\n",
282+
" 'auc_macro': 0.9401}}}"
224283
]
225284
},
226-
"execution_count": 31,
285+
"execution_count": 26,
227286
"metadata": {},
228287
"output_type": "execute_result"
229288
}
@@ -241,7 +300,7 @@
241300
},
242301
{
243302
"cell_type": "code",
244-
"execution_count": 32,
303+
"execution_count": 27,
245304
"metadata": {},
246305
"outputs": [
247306
{
@@ -255,10 +314,10 @@
255314
{
256315
"data": {
257316
"text/plain": [
258-
"{'overall': {'purity': [0.6412213740458015], 'nmi': [0.4771373196787525]}}"
317+
"{'overall': {'purity': [0.6793893129770993], 'nmi': [0.4854751062047489]}}"
259318
]
260319
},
261-
"execution_count": 32,
320+
"execution_count": 27,
262321
"metadata": {},
263322
"output_type": "execute_result"
264323
}
@@ -276,12 +335,23 @@
276335
},
277336
{
278337
"cell_type": "code",
279-
"execution_count": null,
338+
"execution_count": 28,
280339
"metadata": {},
281-
"outputs": [],
340+
"outputs": [
341+
{
342+
"data": {
343+
"text/plain": [
344+
"0.9379255572546902"
345+
]
346+
},
347+
"execution_count": 28,
348+
"metadata": {},
349+
"output_type": "execute_result"
350+
}
351+
],
282352
"source": [
283353
"from semb.evaluations.centrality_correlation import *\n",
284-
"centrality_correlation(brazil_airport_graph, \n",
354+
"centrality_correlation(dataset_graph, \n",
285355
" dict_struc2vec_emb, \n",
286356
" centrality='clustering_coeff', \n",
287357
" similarity='euclidean')"

semb/.DS_Store

0 Bytes
Binary file not shown.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from semb.datasets import BaseDataset, DatasetInfo

import os
from typing import List

import networkx as nx

# TODO: Make this a remote URL in the future
SAMPLE_DATA_DIR = os.path.join(os.path.dirname(__file__), "../../../sample-data/BlogCatalog")


class Dataset(BaseDataset):
    """Dataset provider for the BlogCatalog social-network graph."""

    def get_id(self) -> str:
        """Return the unique identifier of this dataset provider."""
        return 'BlogCatalog'

    def get_datasets(self) -> List[DatasetInfo]:
        """Return the list of datasets offered by this provider (one edgelist)."""
        return [
            DatasetInfo(name="BlogCatalog", description="BlogCatalog data",
                        src_url=f'{SAMPLE_DATA_DIR}/BlogCatalog.edgelist')]

    def load_dataset(self, dataset: DatasetInfo, directed=False, weighted=False) -> nx.Graph:
        """Load the edgelist referenced by *dataset* into a networkx graph.

        Args:
            dataset: DatasetInfo whose ``src_url`` is the edgelist path.
            directed: build an ``nx.DiGraph`` instead of an ``nx.Graph``.
            weighted: read a third column from the edgelist as edge weight.

        Returns:
            The loaded graph; every edge carries a ``'weight'`` attribute
            (parsed from the file when *weighted*, otherwise defaulted to 1).
        """
        create_using = nx.DiGraph() if directed else nx.Graph()
        if weighted:
            # BUG FIX: the original passed data=(('weight', 'data')) — the outer
            # parens are just grouping, so read_edgelist received a single
            # ('weight', 'data') tuple instead of a list of (name, type) pairs,
            # and 'data' is not a type callable. Use the documented form.
            graph = nx.read_edgelist(
                dataset.src_url,
                nodetype=int,
                data=[('weight', float)],
                create_using=create_using)
        else:
            graph = nx.read_edgelist(
                dataset.src_url,
                nodetype=int,
                create_using=create_using)
            # Unweighted input: default every edge weight to 1 so downstream
            # code can always rely on a 'weight' attribute. Kept inside the
            # else-branch so weights read from a weighted file are not clobbered.
            for u, v in graph.edges():
                graph[u][v]['weight'] = 1

        return graph
36+

semb/datasets/DD6/dataset.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from semb.datasets import BaseDataset, DatasetInfo

import os
from typing import List

import networkx as nx

# TODO: Make this a remote URL in the future
SAMPLE_DATA_DIR = os.path.join(os.path.dirname(__file__), "../../../sample-data/DD6")


class Dataset(BaseDataset):
    """Dataset provider for the DD6 protein graph."""

    def get_id(self) -> str:
        """Return the unique identifier of this dataset provider."""
        return 'DD6'

    def get_datasets(self) -> List[DatasetInfo]:
        """Return the list of datasets offered by this provider (one edgelist)."""
        return [
            DatasetInfo(name="DD6", description="DD6 dataset",
                        src_url=f'{SAMPLE_DATA_DIR}/DD6.edgelist')]

    def load_dataset(self, dataset: DatasetInfo, directed=False, weighted=False) -> nx.Graph:
        """Load the edgelist referenced by *dataset* into a networkx graph.

        Args:
            dataset: DatasetInfo whose ``src_url`` is the edgelist path.
            directed: build an ``nx.DiGraph`` instead of an ``nx.Graph``.
            weighted: read a third column from the edgelist as edge weight.

        Returns:
            The loaded graph; every edge carries a ``'weight'`` attribute
            (parsed from the file when *weighted*, otherwise defaulted to 1).
        """
        create_using = nx.DiGraph() if directed else nx.Graph()
        if weighted:
            # BUG FIX: the original passed data=(('weight', 'data')) — the outer
            # parens are just grouping, so read_edgelist received a single
            # ('weight', 'data') tuple instead of a list of (name, type) pairs,
            # and 'data' is not a type callable. Use the documented form.
            graph = nx.read_edgelist(
                dataset.src_url,
                nodetype=int,
                data=[('weight', float)],
                create_using=create_using)
        else:
            graph = nx.read_edgelist(
                dataset.src_url,
                nodetype=int,
                create_using=create_using)
            # Unweighted input: default every edge weight to 1 so downstream
            # code can always rely on a 'weight' attribute. Kept inside the
            # else-branch so weights read from a weighted file are not clobbered.
            for u, v in graph.edges():
                graph[u][v]['weight'] = 1

        return graph
36+

semb/datasets/Facebook/dataset.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
from semb.datasets import BaseDataset, DatasetInfo

import os
from typing import List

import networkx as nx

# TODO: Make this a remote URL in the future
SAMPLE_DATA_DIR = os.path.join(os.path.dirname(__file__), "../../../sample-data/Facebook")


class Dataset(BaseDataset):
    """Dataset provider for the Facebook social-network graph."""

    def get_id(self) -> str:
        """Return the unique identifier of this dataset provider."""
        return 'Facebook'

    def get_datasets(self) -> List[DatasetInfo]:
        """Return the list of datasets offered by this provider (one edgelist)."""
        return [
            DatasetInfo(name="Facebook", description="Facebook dataset",
                        src_url=f'{SAMPLE_DATA_DIR}/Facebook.edgelist')]

    def load_dataset(self, dataset: DatasetInfo, directed=False, weighted=False) -> nx.Graph:
        """Load the edgelist referenced by *dataset* into a networkx graph.

        Args:
            dataset: DatasetInfo whose ``src_url`` is the edgelist path.
            directed: build an ``nx.DiGraph`` instead of an ``nx.Graph``.
            weighted: read a third column from the edgelist as edge weight.

        Returns:
            The loaded graph; every edge carries a ``'weight'`` attribute
            (parsed from the file when *weighted*, otherwise defaulted to 1).
        """
        create_using = nx.DiGraph() if directed else nx.Graph()
        if weighted:
            # BUG FIX: the original passed data=(('weight', 'data')) — the outer
            # parens are just grouping, so read_edgelist received a single
            # ('weight', 'data') tuple instead of a list of (name, type) pairs,
            # and 'data' is not a type callable. Use the documented form.
            graph = nx.read_edgelist(
                dataset.src_url,
                nodetype=int,
                data=[('weight', float)],
                create_using=create_using)
        else:
            graph = nx.read_edgelist(
                dataset.src_url,
                nodetype=int,
                create_using=create_using)
            # Unweighted input: default every edge weight to 1 so downstream
            # code can always rely on a 'weight' attribute. Kept inside the
            # else-branch so weights read from a weighted file are not clobbered.
            for u, v in graph.edges():
                graph[u][v]['weight'] = 1

        return graph
36+

0 commit comments

Comments
 (0)