wip: adding dataset iterator class

PJEstrada · PJEstrada · commit c9e953cbbb55 · 2021-08-09T14:21:58.000-06:00
diff --git a/pytorch_test.py b/pytorch_test.py
@@ -23,11 +23,17 @@
 img = Image.new("L", [diffgram_dataset[0]['diffgram_file'].image['width'], diffgram_dataset[0]['diffgram_file'].image['height']], 0)
 mask1 = diffgram_dataset[0]['polygon_mask_list'][0]
 mask2 = diffgram_dataset[0]['polygon_mask_list'][1]
-print(mask1)
-for x in mask1:
-    print(x)
 plt.figure()
 plt.subplot(1,2,1)
 # plt.imshow(img, 'gray', interpolation='none')
 plt.imshow(mask1, 'jet', interpolation='none', alpha=0.7)
-plt.imshow(mask2, 'Oranges', interpolation='none', alpha=0.7)
+plt.imshow(mask2, 'Oranges', interpolation='none', alpha=0.7)
+plt.show()
+
+
+# Dataset Example
+
+dataset = project.directory.get('Default')
+
+sliced_dataset = dataset.slice(query = 'labels.sheep  > 0 or labels.sofa > 0')
+
diff --git a/sdk/diffgram/core/diffgram_dataset_iterator.py b/sdk/diffgram/core/diffgram_dataset_iterator.py
@@ -0,0 +1,129 @@
+import numpy as np
+from imageio import imread
+from PIL import Image, ImageDraw
+
+
+class DiffgramDatasetIterator:
+    """
+    Iterator over Diffgram image files that yields one sample dict per file.
+
+    Each sample contains the decoded image, the Diffgram file object, the label
+    ids/names of its instances and, when present, normalized box coordinates
+    and polygon masks.
+    """
+
+    def __init__(self, project, diffgram_file_id_list):
+        """
+        Store the project and file IDs, validate the IDs and reset the cursor.
+
+        :param project (sdk.core.core.Project): A Project object from the Diffgram SDK
+        :param diffgram_file_id_list (list): An arbitrary number of file ID's from Diffgram.
+        :raises Exception: if the project reports that some IDs are not part of it.
+        """
+        self.diffgram_file_id_list = diffgram_file_id_list
+
+        self.project = project
+        self._internal_file_list = []
+        self.__validate_file_ids()
+        self.current_file_index = 0
+
+    def __iter__(self):
+        """Restart iteration from the first file ID and return the iterator itself."""
+        self.current_file_index = 0
+        return self
+
+    def __next__(self):
+        """
+        Fetch the next file (with its instances) and return its sample dict.
+
+        :return: the dict built by ``get_file_instances``.
+        :raises StopIteration: once every file ID has been consumed.
+        """
+        # Without this guard the list lookup raises IndexError at the end, which
+        # a `for` loop does not interpret as the end of the iteration.
+        if self.current_file_index >= len(self.diffgram_file_id_list):
+            raise StopIteration
+        file_id = self.diffgram_file_id_list[self.current_file_index]
+        diffgram_file = self.project.file.get_by_id(file_id, with_instances = True)
+        instance_data = self.get_file_instances(diffgram_file)
+        # Advance only after a successful fetch so a failed file can be retried.
+        self.current_file_index += 1
+        return instance_data
+
+    def __validate_file_ids(self):
+        """
+        Ask the project whether every file ID exists in it.
+
+        :raises Exception: if ``file_list_exists`` returns a falsy result.
+        """
+        result = self.project.file.file_list_exists(self.diffgram_file_id_list)
+        if not result:
+            raise Exception(
+                'Some file IDs do not belong to the project. Please provide only files from the same project.')
+
+    def get_image_data(self, diffgram_file):
+        """
+        Download and decode the image of a Diffgram file.
+
+        :param diffgram_file: file object exposing an ``image`` mapping with a ``url_signed`` entry.
+        :return: the array returned by ``imageio.imread`` for the signed URL.
+        :raises Exception: if the file has no ``image`` attribute.
+        """
+        if not hasattr(diffgram_file, 'image'):
+            raise Exception('Pytorch datasets only support images. Please provide only file_ids from images')
+        return imread(diffgram_file.image.get('url_signed'))
+
+    def get_file_instances(self, diffgram_file):
+        """
+        Build the sample dict for one file.
+
+        :param diffgram_file: file object fetched with ``with_instances = True``.
+        :return: dict with ``image``, ``diffgram_file``, ``label_id_list`` and
+            ``label_name_list``; plus ``x_min_list``/``x_max_list``/``y_min_list``/
+            ``y_max_list`` when the file has boxes and ``polygon_mask_list`` when
+            it has polygons.
+        :raises NotImplementedError: for non image/frame files or for instance
+            types other than box and polygon.
+        """
+        # NOTE(review): the type is read with subscript access while the rest of
+        # the class uses attribute access (``.image``, ``.instance_list``) -
+        # confirm the file object supports both.
+        if diffgram_file['type'] not in ['image', 'frame']:
+            raise NotImplementedError('File type "{}" is not supported yet'.format(diffgram_file['type']))
+
+        image = self.get_image_data(diffgram_file)
+        instance_list = diffgram_file.instance_list
+        instance_types_in_file = {x['type'] for x in instance_list}
+
+        # Reject anything that is neither a box nor a polygon, as the error
+        # message states (the previous check tested ``has_boxes`` twice).
+        if instance_types_in_file - {'box', 'polygon'}:
+            raise NotImplementedError(
+                'SDK only supports boxes and polygon types currently. If you want a new instance type to be supported please contact us!'
+            )
+
+        # Process the instances of each file
+        sample = {'image': image, 'diffgram_file': diffgram_file}
+        if 'box' in instance_types_in_file:
+            x_min_list, x_max_list, y_min_list, y_max_list = self.extract_bbox_values(instance_list, diffgram_file)
+            sample['x_min_list'] = x_min_list
+            sample['x_max_list'] = x_max_list
+            sample['y_min_list'] = y_min_list
+            sample['y_max_list'] = y_max_list
+
+        if 'polygon' in instance_types_in_file:
+            sample['polygon_mask_list'] = self.extract_masks_from_polygon(instance_list, diffgram_file)
+
+        label_id_list, label_name_list = self.extract_labels(instance_list)
+        sample['label_id_list'] = label_id_list
+        sample['label_name_list'] = label_name_list
+
+        return sample
+
+    def extract_masks_from_polygon(self, instance_list, diffgram_file, empty_value = 0):
+        """
+        Rasterize every polygon instance into its own mask.
+
+        :param instance_list: instances of the file; non-polygon entries are skipped.
+        :param diffgram_file: file object whose ``image`` mapping has ``width`` and ``height``.
+        :param empty_value: currently unused; kept for interface compatibility.
+        :return: list of float32 arrays, one per polygon, with 1 on the polygon
+            (outline and fill) and 0 elsewhere.
+        """
+        nx, ny = diffgram_file.image['width'], diffgram_file.image['height']
+        mask_list = []
+        for instance in instance_list:
+            if instance['type'] != 'polygon':
+                continue
+            poly = [(p['x'], p['y']) for p in instance['points']]
+
+            img = Image.new(mode = 'L', size = (nx, ny), color = 0)  # mode L = 8-bit pixels, black and white
+            draw = ImageDraw.Draw(img)
+            draw.polygon(poly, outline = 1, fill = 1)
+            mask_list.append(np.array(img).astype('float32'))
+        return mask_list
+
+    def extract_labels(self, instance_list, allowed_instance_types = None):
+        """
+        Collect the label file id and label name of each instance.
+
+        :param instance_list: instances carrying a ``label_file`` mapping.
+        :param allowed_instance_types: optional collection of instance types to
+            keep; when empty or None every instance is kept.
+        :return: tuple ``(label_file_id_list, label_names_list)`` in instance order.
+        """
+        label_file_id_list = []
+        label_names_list = []
+
+        for inst in instance_list:
+            # Skip instances whose type is NOT in the allow-list; the previous
+            # condition was inverted and dropped exactly the allowed types.
+            if allowed_instance_types and inst['type'] not in allowed_instance_types:
+                continue
+
+            label_file_id_list.append(inst['label_file']['id'])
+            label_names_list.append(inst['label_file']['label']['name'])
+
+        return label_file_id_list, label_names_list
+
+    def extract_bbox_values(self, instance_list, diffgram_file):
+        """
+        Collect box coordinates normalized by the image dimensions.
+
+        x values are divided by the image width and y values by the image height.
+
+        :param instance_list: instances of the file; non-box entries are skipped.
+        :param diffgram_file: file object whose ``image`` mapping has ``width`` and ``height``.
+        :return: tuple ``(x_min_list, x_max_list, y_min_list, y_max_list)`` of plain lists.
+        """
+        width = diffgram_file.image['width']
+        height = diffgram_file.image['height']
+        x_min_list = []
+        x_max_list = []
+        y_min_list = []
+        y_max_list = []
+
+        for inst in instance_list:
+            if inst['type'] != 'box':
+                continue
+            x_min_list.append(inst['x_min'] / width)
+            x_max_list.append(inst['x_max'] / width)
+            # y coordinates are normalized by height (they were divided by width before).
+            y_min_list.append(inst['y_min'] / height)
+            y_max_list.append(inst['y_max'] / height)
+
+        return x_min_list, x_max_list, y_min_list, y_max_list
diff --git a/sdk/diffgram/core/directory.py b/sdk/diffgram/core/directory.py
@@ -109,6 +109,7 @@ def slice(self, query):
 			file_view_mode = 'ids_only'
 		)
 		sliced_dataset = SlicedDirectory(
+			client = self.client,
 			query = query,
 			original_directory = self
 		)
diff --git a/sdk/diffgram/pytorch_diffgram/__pycache__/diffgram_pytorch_dataset.cpython-38.pyc b/sdk/diffgram/pytorch_diffgram/__pycache__/diffgram_pytorch_dataset.cpython-38.pyc
diff --git a/sdk/diffgram/pytorch_diffgram/diffgram_pytorch_dataset.py b/sdk/diffgram/pytorch_diffgram/diffgram_pytorch_dataset.py
@@ -1,13 +1,12 @@
-from torch.utils.data import Dataset, DataLoader
-import torch
 import os
-from imageio import imread
+
 import numpy as np
 import scipy as sp
-from PIL import Image, ImageDraw
+
+from diffgram.core.diffgram_dataset_iterator import DiffgramDatasetIterator
 
 
-class DiffgramPytorchDataset(Dataset):
+class DiffgramPytorchDataset(DiffgramDatasetIterator, Dataset):
 
     def __init__(self, project, diffgram_file_id_list = None, transform = None):
         """
@@ -16,60 +15,21 @@ def __init__(self, project, diffgram_file_id_list = None, transform = None):
         :param diffgram_file_list (list): An arbitrary number of file ID's from Diffgram.
         :param transform (callable, optional): Optional transforms to be applied on a sample
         """
+        super(DiffgramDatasetIterator, self).__init__(project, diffgram_file_id_list)
+        global torch, Dataset, DataLoader
+        try:
+            import torch as torch  # type: ignore
+            from torch.utils.data import Dataset, DataLoader
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "'torch' module should be installed to convert the Dataset into pytorch format"
+            )
         self.diffgram_file_id_list = diffgram_file_id_list
 
         self.project = project
         self.transform = transform
-        self._internal_file_list = []
         self.__validate_file_ids()
 
-    def __validate_file_ids(self):
-        result = self.project.file.file_list_exists(self.diffgram_file_id_list)
-        if not result:
-            raise Exception(
-                'Some file IDs do not belong to the project. Please provide only files from the same project.')
-
-    def __extract_masks_from_polygon(self, instance_list, diffgram_file, empty_value = 0):
-        nx, ny = diffgram_file.image['width'], diffgram_file.image['height']
-        mask_list = []
-        for instance in instance_list:
-            if instance['type'] != 'polygon':
-                continue
-            poly = [(p['x'], p['y']) for p in instance['points']]
-
-            img = Image.new(mode = 'L', size = (nx, ny), color = 0)  # mode L = 8-bit pixels, black and white
-            draw = ImageDraw.Draw(img)
-            print()
-            draw.polygon(poly, outline = 1, fill = 1)
-            mask = np.array(img).astype('float32')
-            # mask[np.where(mask == 0)] = empty_value
-            print('mask', len(mask))
-            mask_list.append(mask)
-        return mask_list
-
-    def __extract_bbox_values(self, instance_list, diffgram_file):
-        """
-            Creates a pytorch tensor based on the instance type.
-            For now we are assuming shapes here, but we can extend it
-            to accept custom shapes specified by the user.
-        :param instance:
-        :return:
-        """
-        x_min_list = []
-        x_max_list = []
-        y_min_list = []
-        y_max_list = []
-
-        for inst in instance_list:
-            if inst['type'] != 'box':
-                continue
-            x_min_list.append(inst['x_min'] / diffgram_file.image['width'])
-            x_max_list.append(inst['x_max'] / diffgram_file.image['width'])
-            y_min_list.append(inst['y_min'] / diffgram_file.image['width'])
-            y_max_list.append(inst['y_max'] / diffgram_file.image['width'])
-
-        return x_min_list, x_max_list, y_min_list, y_max_list
-
     def __len__(self):
         return len(self.diffgram_file_id_list)
 
@@ -81,25 +41,17 @@ def __getitem__(self, idx):
             idx = idx.tolist()
 
         diffgram_file = self.project.file.get_by_id(self.diffgram_file_id_list[idx], with_instances = True)
-        if hasattr(diffgram_file, 'image'):
-            image = imread(diffgram_file.image.get('url_signed'))
-        else:
-            raise Exception('Pytorch datasets only support images. Please provide only file_ids from images')
 
-        instance_list = diffgram_file.instance_list
-        instance_types_in_file = set([x['type'] for x in instance_list])
-        # Process the instances of each file
-        processed_instance_list = []
-        sample = {'image': image, 'diffgram_file': diffgram_file}
-        if 'box' in instance_types_in_file:
-            x_min_list, x_max_list, y_min_list, y_max_list = self.__extract_bbox_values(instance_list, diffgram_file)
-            sample['x_min_list'] = torch.Tensor(x_min_list)
-            sample['x_max_list'] = torch.Tensor(x_max_list)
-            sample['y_min_list'] = torch.Tensor(y_min_list)
-            sample['y_max_list'] = torch.Tensor(y_max_list)
-        if 'polygon' in instance_types_in_file:
-            mask_list = self.__extract_masks_from_polygon(instance_list, diffgram_file)
-            sample['polygon_mask_list'] = mask_list
+        sample = self.get_file_instances(diffgram_file)
+        if 'x_min_list' in sample:
+            sample['x_min_list'] = torch.Tensor(sample['x_min_list'])
+        if 'x_max_list' in sample:
+            sample['x_max_list'] = torch.Tensor(sample['x_max_list'])
+        if 'y_min_list' in sample:
+            sample['y_min_list'] = torch.Tensor(sample['y_min_list'])
+        if 'y_max_list' in sample:
+            sample['y_max_list'] = torch.Tensor(sample['y_max_list'])
+
         if self.transform:
             sample = self.transform(sample)
 
diff --git a/sdk/diffgram/tensorflow_diffgram/__init__.py b/sdk/diffgram/tensorflow_diffgram/__init__.py
diff --git a/sdk/diffgram/tensorflow_diffgram/diffgram_tensorflow_dataset.py b/sdk/diffgram/tensorflow_diffgram/diffgram_tensorflow_dataset.py
@@ -0,0 +1,80 @@
+from diffgram.core.diffgram_dataset_iterator import DiffgramDatasetIterator
+import os
+
+
+class DiffgramTensorflowDataset(DiffgramDatasetIterator):
+    """
+    Iterator that converts each Diffgram file into a ``tf.train.Example``.
+
+    TensorFlow is imported lazily in ``__init__`` so the SDK can be used
+    without it being installed.
+    """
+
+    def __init__(self, project, diffgram_file_id_list = None):
+        """
+        Import TensorFlow and initialize the underlying iterator.
+
+        :param project (sdk.core.core.Project): A Project object from the Diffgram SDK
+        :param diffgram_file_id_list (list): An arbitrary number of file ID's from Diffgram.
+        :raises ModuleNotFoundError: if ``tensorflow`` is not installed.
+        :raises Exception: if some file IDs do not belong to the project.
+        """
+        # Import first so a missing dependency fails before any project call.
+        global tf
+        try:
+            import tensorflow as tf  # type: ignore
+        except ModuleNotFoundError:
+            raise ModuleNotFoundError(
+                "'tensorflow' module should be installed to convert the Dataset into tensorflow format"
+            )
+        # ``super(DiffgramDatasetIterator, self)`` skipped the base initializer
+        # and forwarded the arguments to ``object.__init__``. The base
+        # initializer stores the project and IDs, validates them and resets
+        # the cursor.
+        super().__init__(project, diffgram_file_id_list)
+
+    def int64_feature(self, value):
+        """Wrap a single integer in an Int64List Feature."""
+        return tf.train.Feature(int64_list = tf.train.Int64List(value = [value]))
+
+    def int64_list_feature(self, value):
+        """Wrap a sequence of integers in an Int64List Feature."""
+        return tf.train.Feature(int64_list = tf.train.Int64List(value = value))
+
+    def bytes_feature(self, value):
+        """Wrap a single bytes value in a BytesList Feature; ``str`` is UTF-8 encoded first."""
+        if isinstance(value, str):
+            value = value.encode('utf-8')
+        return tf.train.Feature(bytes_list = tf.train.BytesList(value = [value]))
+
+    def bytes_list_feature(self, value):
+        """Wrap a sequence of bytes values in a BytesList Feature; ``str`` items are UTF-8 encoded first."""
+        encoded = [v.encode('utf-8') if isinstance(v, str) else v for v in value]
+        return tf.train.Feature(bytes_list = tf.train.BytesList(value = encoded))
+
+    def float_feature(self, value):
+        """Wrap a single float in a FloatList Feature."""
+        return tf.train.Feature(float_list = tf.train.FloatList(value = [value]))
+
+    def float_list_feature(self, value):
+        """Wrap a sequence of floats in a FloatList Feature."""
+        return tf.train.Feature(float_list = tf.train.FloatList(value = value))
+
+    def __next__(self):
+        """
+        Fetch the next file and convert its sample into a ``tf.train.Example``.
+
+        :return: the ``tf.train.Example`` built from the file's sample dict.
+        :raises StopIteration: once every file ID has been consumed.
+        """
+        # Signal the end of the iteration instead of failing with IndexError.
+        if self.current_file_index >= len(self.diffgram_file_id_list):
+            raise StopIteration
+        file_id = self.diffgram_file_id_list[self.current_file_index]
+        diffgram_file = self.project.file.get_by_id(file_id, with_instances = True)
+        instance_data = self.get_file_instances(diffgram_file)
+        # Image metadata is read from the ``image`` mapping, the same way the
+        # base class reads width and height.
+        image_info = diffgram_file.image
+        filename, file_extension = os.path.splitext(image_info['original_filename'])
+        tf_example_dict = {
+            'image/height': self.int64_feature(image_info['height']),
+            'image/width': self.int64_feature(image_info['width']),
+            'image/filename': self.bytes_feature(filename),
+            'image/source_id': self.bytes_feature(filename),
+            # NOTE(review): ``instance_data['image']`` is the decoded array from
+            # imread, not encoded file bytes - confirm what should be stored here.
+            'image/encoded': self.bytes_feature(instance_data['image']),
+            'image/format': self.bytes_feature(file_extension),
+            # Box keys are only present when the file has boxes; default to empty lists.
+            'image/object/bbox/xmin': self.float_list_feature(instance_data.get('x_min_list', [])),
+            'image/object/bbox/xmax': self.float_list_feature(instance_data.get('x_max_list', [])),
+            'image/object/bbox/ymin': self.float_list_feature(instance_data.get('y_min_list', [])),
+            'image/object/bbox/ymax': self.float_list_feature(instance_data.get('y_max_list', [])),
+            'image/object/class/text': self.bytes_list_feature(instance_data['label_name_list']),
+            'image/object/class/label': self.int64_list_feature(instance_data['label_id_list']),
+        }
+        tf_example = tf.train.Example(features = tf.train.Features(feature = tf_example_dict))
+        self.current_file_index += 1
+        return tf_example
+
+    def get_dataset_obj(self):
+        """
+        Build a ``tf.data.Dataset`` from this iterator.
+
+        :return: the object returned by ``tf.data.Dataset.from_generator``.
+        """
+        # NOTE(review): ``from_generator`` is called without ``output_types`` or
+        # ``output_signature`` - confirm this works with the targeted TensorFlow version.
+        return tf.data.Dataset.from_generator(self.__iter__)

Original file line number	Diff line number	Diff line change
`@@ -109,6 +109,7 @@ def slice(self, query):`
`109`	`109`	`file_view_mode = 'ids_only'`
`110`	`110`	`)`
`111`	`111`	`sliced_dataset = SlicedDirectory(`
	`112`	`+ client = self.client,`
`112`	`113`	`query = query,`
`113`	`114`	`original_directory = self`
`114`	`115`	`)`