feat: initial support to export to pytorch

PJEstrada · PJEstrada · commit b9e032c0db16 · 2021-06-30T14:32:31.000-06:00
Gives users the ability to export any dataset into a PyTorch dataset. Support for instance types other than boxes, and for video files, is still pending.
diff --git a/sdk/add_file_id_to_json.py b/sdk/add_file_id_to_json.py
@@ -0,0 +1,51 @@
+import json
+import os
+
+from diffgram.core.core import Project
+
+# Credentials are read from the environment so that they are never committed with the script.
+project = Project(project_string_id = "coco-dataset",
+                  debug = True,
+                  client_id = os.environ["DIFFGRAM_CLIENT_ID"],
+                  client_secret = os.environ["DIFFGRAM_CLIENT_SECRET"])
+
+
+def find_file(file_list, name):
+    """Return the first file whose original_filename equals name, or None if there is none."""
+    for f in file_list:
+        if f.original_filename == name:
+            return f
+    return None
+
+
+with open('/home/pablo/Downloads/coco2017.json') as json_file:
+    data = json.load(json_file)
+
+dataset_default = project.directory.get(name = "Default")
+
+# Fetch the directory page by page until the metadata reports no next page.
+page_num = 1
+all_files = []
+print('start')
+while page_num is not None:
+    print('Current page', page_num)
+    diffgram_files = dataset_default.list_files(limit = 1000, page_num = page_num, file_view_mode = 'base')
+    page_num = dataset_default.file_list_metadata['next_page']
+    print('{} of {}'.format(page_num, dataset_default.file_list_metadata['total_pages']))
+    all_files.extend(diffgram_files)
+
+print('')
+print('Files fetched: ', len(all_files))
+result = []
+for elm in data:
+    file = find_file(all_files, name = elm['image_name'])
+    if file:
+        print('Adding file ID {} to {}'.format(file.id, elm['image_name']))
+        elm['file_id'] = file.id
+        result.append(elm)
+    else:
+        print(elm['image_name'], 'not found.')
+
+# Write through a context manager so the output file is flushed and closed.
+with open('/home/pablo/Downloads/coco2017_with_ids.json', 'w') as f:
+    f.write(json.dumps(result))
diff --git a/sdk/diffgram/core/directory.py b/sdk/diffgram/core/directory.py
@@ -1,7 +1,7 @@
 from diffgram.file.file import File
 from ..regular.regular import refresh_from_dict
 import logging
-
+from diffgram.pytorch_diffgram.diffgram_pytorch_dataset import DiffgramPytorchDataset
 
 def get_directory_list(self):
 	"""
@@ -78,6 +78,34 @@ def __init__(self,
 		self.id = None
 		self.file_list_metadata = {}
 
+	def all_files(self):
+		"""
+			Get all the files of the directory, requesting them page by page (1000 files per request).
+			Warning! This can be an expensive function and take a long time.
+		:return: list with everything list_files returned for each page, in request order
+		"""
+		page_num = 1
+		result = []
+		while page_num is not None:
+			# next_page is read right after each call; presumably list_files refreshes file_list_metadata (confirm)
+			result.extend(self.list_files(limit = 1000, page_num = page_num, file_view_mode = 'base'))
+			page_num = self.file_list_metadata['next_page']
+		return result
+
+	def to_pytorch(self, transform = None):
+		"""
+			Build a pytorch dataset from the IDs of every file returned by all_files().
+		:param transform: optional callable, forwarded unchanged to DiffgramPytorchDataset
+		:return: DiffgramPytorchDataset created with self.client as its project
+		"""
+		file_id_list = []
+		for diffgram_file in self.all_files():
+			file_id_list.append(diffgram_file.id)
+		return DiffgramPytorchDataset(
+			project = self.client,
+			diffgram_file_id_list = file_id_list,
+			transform = transform
+		)
 
 	def new(self, name: str):
 		"""
diff --git a/sdk/diffgram/file/file.py b/sdk/diffgram/file/file.py
@@ -1,6 +1,5 @@
 from ..regular.regular import refresh_from_dict
 
-
 class File():
     """
     file literal object
@@ -11,11 +10,12 @@ class File():
 
     def __init__(
             self,
-            id=None,
-            client=None):
+            id = None,
+            client = None):
         self.id = id
         self.client = client
 
+    @staticmethod
     def new(
             client,
             file_json):
diff --git a/sdk/diffgram/file/file_constructor.py b/sdk/diffgram/file/file_constructor.py
@@ -414,29 +414,42 @@ def import_bulk():
 
 
 	def get_by_id(self, 
-				  id: int):
+				  id: int,
+				  with_instances: bool = False):
 		"""
 		returns Diffgram File object
 		"""
-	
-		endpoint = "/api/v1/file/view"
 
-		spec_dict = {
-			'file_id': id,
-			'project_string_id': self.client.project_string_id
+		if not with_instances:
+			endpoint = "/api/v1/file/view"
+
+			spec_dict = {
+				'file_id': id,
+				'project_string_id': self.client.project_string_id,
+				}
+
+
+			file_response_key = 'file'
+
+		else:
+			endpoint = "/api/project/{}/file/{}/annotation/list".format(self.client.project_string_id, id)
+			spec_dict = {
+				'directory_id': self.client.directory_id
 			}
+			file_response_key = 'file_serialized'
 
 		response = self.client.session.post(
 			self.client.host + endpoint,
 			json = spec_dict)
-		
+
 		self.client.handle_errors(response)
 
 		response_json = response.json()
+		file_data = response_json.get(file_response_key)
 
 		return File.new(
 			client = self.client,
-			file_json = response_json.get('file'))
+			file_json = file_data)
 
 
 
diff --git a/sdk/diffgram/pytorch_diffgram/__init__.py b/sdk/diffgram/pytorch_diffgram/__init__.py
diff --git a/sdk/diffgram/pytorch_diffgram/diffgram_pytorch_dataset.py b/sdk/diffgram/pytorch_diffgram/diffgram_pytorch_dataset.py
@@ -0,0 +1,59 @@
+from torch.utils.data import Dataset, DataLoader
+import torch
+import os
+from imageio import imread
+import numpy as np
+
+
+class DiffgramPytorchDataset(Dataset):
+    """Pytorch Dataset that fetches Diffgram image files and their box instances by file ID."""
+
+    def __init__(self, project, diffgram_file_id_list, transform = None):
+        """
+
+        :param project (sdk.core.core.Project): A Project object from the Diffgram SDK
+        :param diffgram_file_id_list (list): An arbitrary number of file ID's from Diffgram.
+        :param transform (callable, optional): Optional transforms to be applied on a sample
+        """
+        self.diffgram_file_id_list = diffgram_file_id_list
+        self.project = project
+        self.transform = transform
+
+    def __process_instance(self, instance):
+        """
+            Creates a pytorch tensor based on the instance type.
+            Only 'box' instances are supported for now; the tensor holds
+            [x_min, y_min, x_max, y_max] in that order.
+        :param instance: instance dict with a 'type' key
+        :return: torch tensor built from the instance
+        :raises NotImplementedError: for any instance type other than 'box'
+        """
+        if instance['type'] != 'box':
+            raise NotImplementedError('Instance type {} is not supported yet'.format(instance['type']))
+        coords = [instance['x_min'], instance['y_min'], instance['x_max'], instance['y_max']]
+        return torch.tensor(np.array(coords))
+
+    def __len__(self):
+        return len(self.diffgram_file_id_list)
+
+    def __getitem__(self, idx):
+        if torch.is_tensor(idx):
+            idx = idx.tolist()
+
+        # idx is a position in the dataset; look up the Diffgram file ID stored at that position.
+        diffgram_file = self.project.file.get_by_id(self.diffgram_file_id_list[idx], with_instances = True)
+        if hasattr(diffgram_file, 'image'):
+            image = imread(diffgram_file.image.get('url_signed'))
+        else:
+            raise Exception('Pytorch datasets only support images. Please provide only file_ids from images')
+
+        # Convert every instance of the file into a tensor and return those in the sample
+        processed_instance_list = [
+            self.__process_instance(instance) for instance in diffgram_file.instance_list
+        ]
+        sample = {'image': image, 'instance_list': processed_instance_list}
+
+        if self.transform:
+            sample = self.transform(sample)
+
+        return sample
diff --git a/sdk/requirements.txt b/sdk/requirements.txt
@@ -3,4 +3,6 @@ opencv-python>=4.0.0.21
 scipy>=1.1.0
 six>=1.9.0
 tensorflow>=1.12.0
-pillow
+pillow
+torch
+imageio