
Commit 2b4b272

docs(process): updates.
Merge branch 'SeFlow' into DeFlow
2 parents 37fc9d3 + 7ad5b85

5 files changed: 181 additions & 26 deletions

dataprocess/README.md

Lines changed: 15 additions & 15 deletions
@@ -12,47 +12,47 @@ We've updated the process dataset for:
 - [x] Waymo: check [here](#waymo-dataset). The process script was adapted from [SeFlow](https://github.com/KTH-RPL/SeFlow).
 - [ ] nuScenes: done coding, public after review. Will be integrated later by another paper.
 
-If you want to use all datasets above, there is a specific process environment in [envprocess.yml](../envprocess.yml) to install all the necessary packages. As Waymo package have different configuration and conflict with the main environment. Setup through the following command:
+If you want to use all the datasets above, there is a dedicated process environment in [envprocess.yaml](../envprocess.yaml) that installs all the necessary packages, since the Waymo package has a different configuration and conflicts with the main environment. Set it up with the following commands:
 
 ```bash
-conda env create -f envprocess.yml
+conda env create -f envprocess.yaml
 conda activate dataprocess
+# NOTE: we need to **manually reinstall numpy** (1.22 or higher):
+# * the waymo package forces numpy==1.21.5, BUT
+# * hdbscan with numpy<1.22.0 raises: "'numpy.float64' object cannot be interpreted as an integer"
+# * av2 needs numpy>=1.22.0, and waymo still runs fine with numpy==1.22.0
+pip install numpy==1.22
 ```
 
 ## Download
 
 ### Argoverse 2.0
 
-Install their download tool:
-```bash
-mamba install s5cmd -c conda-forge
-```
-
-Download the dataset:
+Install their download tool `s5cmd` (already included in our envprocess.yaml), then download the dataset:
 ```bash
 # train is really big (750): 966 GB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/sensor/train/*" sensor/train
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/sensor/train/*" av2/sensor/train
 
 # val (150) and test (150): 168 GB + 168 GB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/sensor/val/*" sensor/val
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/sensor/test/*" sensor/test
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/sensor/val/*" av2/sensor/val
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/sensor/test/*" av2/sensor/test
 
 # the local and online eval masks from the official repo
 s5cmd --no-sign-request cp "s3://argoverse/tasks/3d_scene_flow/zips/*" .
 ```
 
 Then, to quickly pre-process the data, [read these commands](#process) to generate the pre-processed data for training and evaluation. This takes around 0.5-2 hours for the whole dataset (train & val), depending on how powerful your CPU is.
 
-More [self-supervised data in AV2 LiDAR only](https://www.argoverse.org/av2.html#lidar-link), note: It **does not** include **imagery or 3D annotations**. The dataset is designed to support research into self-supervised learning in the lidar domain, as well as point cloud forecasting.
+Optional: more [self-supervised data in the AV2 LiDAR-only split](https://www.argoverse.org/av2.html#lidar-link). Note: it **does not** include **imagery or 3D annotations**; the dataset is designed to support research into self-supervised learning in the lidar domain, as well as point cloud forecasting.
 ```bash
 # train is really big (16000): 4 TB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/lidar/train/*" lidar/train
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/lidar/train/*" av2/lidar/train
 
 # val (2000): 0.5 TB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/lidar/val/*" lidar/val
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/lidar/val/*" av2/lidar/val
 
 # test (2000): 0.5 TB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/lidar/test/*" lidar/test
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/lidar/test/*" av2/lidar/test
 ```
 
 #### Dataset frames
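Since the numpy note above is the part most likely to go wrong, here is a minimal sanity check (a sketch, assuming the `dataprocess` environment is active and `pip install numpy==1.22` has been run):

```python
# Hedged sanity check for the numpy pin described in the README diff above.
import numpy as np

major, minor = (int(x) for x in np.__version__.split(".")[:2])
assert (major, minor) >= (1, 22), (
    f"numpy {np.__version__} is too old: hdbscan on numpy<1.22 fails with "
    "\"'numpy.float64' object cannot be interpreted as an integer\""
)
print(f"numpy {np.__version__}: new enough for hdbscan and av2; waymo still runs fine")
```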

dataprocess/extract_waymo.py

Lines changed: 1 addition & 3 deletions
@@ -346,7 +346,6 @@ def create_group_data(group, pc, pose, gm = None, flow_0to1=None, flow_valid=None
     first_frame = dataset_pb2.Frame.FromString(bytearray(all_data[0]))
     scene_id = first_frame.context.name
     total_lens = len(all_data)
-    # for data_idx in tqdm(range(1, total_lens), ncols=100):
     for data_idx in range(1, total_lens):
         if data_idx >= total_lens - 2:
             # 0: no correct flow label, end(total_lens - 1) - 1: no correct pose flow
@@ -384,7 +383,6 @@ def process_logs(data_dir: Path, map_dir: Path, output_dir: Path, nproc: int):
         data_dir: Argoverse 2.0 directory
         output_dir: Output directory.
     """
-
     if not (data_dir).exists():
         print(f'{data_dir} not found')
         return
@@ -408,7 +406,7 @@ def process_logs(data_dir: Path, map_dir: Path, output_dir: Path, nproc: int):
 def main(
     flow_data_dir: str = "/home/kin/data/waymo/flowlabel",
     mode: str = "test",
-    map_dir: str = "/home/kin/data/waymo/flowlabel/maps",
+    map_dir: str = "/home/kin/data/waymo/flowlabel/map",
     output_dir: str = "/home/kin/data/waymo/flowlabel/preprocess",
     nproc: int = (multiprocessing.cpu_count() - 1),
     create_index_only: bool = False,
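The first hunk's boundary check is easy to misread, so here is a toy illustration (hypothetical frame count, and assuming the truncated loop body skips the flagged frames, as the inline comment suggests): frame 0 has no correct flow label and the last two frames have no correct pose flow, so a scene of N frames yields N-3 usable samples.

```python
# Sketch of the usable-frame window implied by the hunk above (toy length).
total_lens = 10  # e.g. a 10-frame scene
usable = [i for i in range(1, total_lens) if i < total_lens - 2]
print(usable)  # [1, 2, 3, 4, 5, 6, 7] -> frames 0, 8 and 9 are dropped
```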

envprocess.yaml

Lines changed: 7 additions & 6 deletions
@@ -6,23 +6,24 @@ dependencies:
   - python=3.8
   - pytorch::pytorch=2.0.0
   - pytorch::torchvision
+  - mkl==2024.0.0
   - numba
-  - numpy==1.22
+  - numpy
   - pandas
   - pip
   - scipy
   - tqdm
-  - scikit-learn
   - fire
+  - hdbscan
+  - s5cmd
   - pip:
     - nuscenes-devkit
     - av2==0.2.1
     - waymo-open-dataset-tf-2.11.0==1.5.0
-    - open3d==0.16.0
+    - dufomap==1.0.0
     - linefit
     - dztimer
-    - dufomap==1.0.0
-    - evalai
 
 # Reason about the versions fixed:
-# numpy==1.22: package conflicts, need numpy higher or same 1.22
+# numpy==1.22: package conflicts, need numpy >= 1.22
+# mkl==2024.0.0: https://github.com/pytorch/pytorch/issues/123097
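A quick way to confirm the updated environment resolved cleanly is an import smoke test; the sketch below only uses package names from the lists above (`s5cmd` ships as a CLI binary, not a Python module, so it is left out). Importing `torch` also exercises the new `mkl` pin, since the linked pytorch issue shows up as a failure at import time:

```python
# Smoke test (a sketch) for the dependency changes in envprocess.yaml above.
import importlib

for mod in ("torch", "numpy", "hdbscan", "dufomap", "av2"):
    m = importlib.import_module(mod)
    print(mod, getattr(m, "__version__", "(no __version__)"))
```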

process.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
+"""
+# Created: 2023-11-04 15:55
+# Copyright (C) 2023-now, RPL, KTH Royal Institute of Technology
+# Author: Qingwen Zhang (https://kin-zhang.github.io/)
+#
+# This file is part of SeFlow (https://github.com/KTH-RPL/SeFlow).
+# If you find this repo helpful, please cite the respective publication as
+# listed on the above website.
+#
+# Description: run dufomap on the dataset we preprocessed, for the SSL training afterwards.
+#              It's only needed for SSL training, not for inference.
+#              The goal is to roughly segment dynamic and static points.
+"""
+
+from pathlib import Path
+from tqdm import tqdm
+import numpy as np
+import fire, time, h5py, os
+from hdbscan import HDBSCAN
+
+from src.utils.mics import HDF5Data, transform_to_array
+from dufomap import dufomap
+
+MIN_AXIS_RANGE = 2   # HARD CODED: remove ego vehicle points
+MAX_AXIS_RANGE = 50  # HARD CODED: remove far away points
+
+def run_cluster(
+    data_dir: str = "/home/kin/data/av2/preprocess/sensor/train",
+    scene_range: list = [0, 1],
+    interval: int = 1,  # unused here, kept so both functions share the same interface args
+    overwrite: bool = False,
+):
+    data_path = Path(data_dir)
+    dataset = HDF5Data(data_path)
+    all_scene_ids = list(dataset.scene_id_bounds.keys())
+    for scene_in_data_index, scene_id in enumerate(all_scene_ids):
+        start_time = time.time()
+        # NOTE (Qingwen): the scene id range is [start, end)
+        if scene_range[0] != -1 and scene_range[-1] != -1 and (scene_in_data_index < scene_range[0] or scene_in_data_index >= scene_range[1]):
+            continue
+        bounds = dataset.scene_id_bounds[scene_id]
+        flag_exist_label = True
+        with h5py.File(os.path.join(data_path, f'{scene_id}.h5'), 'r+') as f:
+            for ii in range(bounds["min_index"], bounds["max_index"]+1):
+                key = str(dataset[ii]['timestamp'])
+                if 'label' not in f[key]:
+                    flag_exist_label = False
+                    break
+        if flag_exist_label and not overwrite:
+            print(f"==> Scene {scene_id} already has the cluster 'label', skip.")
+            continue
+
+        hdb = HDBSCAN(min_cluster_size=20, cluster_selection_epsilon=0.7)
+        for i in tqdm(range(bounds["min_index"], bounds["max_index"]+1), desc=f"Start Plus Cluster: {scene_in_data_index}/{len(all_scene_ids)}", ncols=80):
+            data = dataset[i]
+            pc0 = data['pc0'][:, :3]
+            cluster_label = np.zeros(pc0.shape[0], dtype=np.int16)
+
+            if "dufo_label" not in data:
+                print(f"Warning: {scene_id} {data['timestamp']} has no dufo_label and will be skipped. Better to rerun dufomap on this scene.")
+                continue
+            elif data["dufo_label"].sum() < 20:
+                print(f"Warning: {scene_id} {data['timestamp']} has no dynamic points and will be skipped. Better to check this scene.")
+            else:
+                hdb.fit(pc0[data["dufo_label"] == 1])
+                # NOTE (Qingwen): -1 is assigned when a point belongs to no cluster, so shift by 1 to keep 0 for background.
+                cluster_label[data["dufo_label"] == 1] = hdb.labels_ + 1
+
+            # save labels
+            timestamp = data['timestamp']
+            key = str(timestamp)
+            with h5py.File(os.path.join(data_path, f'{scene_id}.h5'), 'r+') as f:
+                if 'label' in f[key]:
+                    # print(f"Warning: {scene_id} {timestamp} has label, will be overwritten.")
+                    del f[key]['label']
+                f[key].create_dataset('label', data=np.array(cluster_label).astype(np.int16))
+        print(f"==> Scene {scene_id} finished, used: {(time.time() - start_time)/60:.2f} mins")
+    print(f"Data inside {str(data_path)} finished. Check the result with the vis() function if you want to visualize it.")
+
+def run_dufo(
+    data_dir: str = "/home/kin/data/av2/preprocess/sensor/train",
+    scene_range: list = [0, 1],
+    interval: int = 1,  # interval of frames to run dufomap on
+    overwrite: bool = False,
+):
+    data_path = Path(data_dir)
+    dataset = HDF5Data(data_path)
+    all_scene_ids = list(dataset.scene_id_bounds.keys())
+    for scene_in_data_index, scene_id in enumerate(all_scene_ids):
+        start_time = time.time()
+        # NOTE (Qingwen): the scene id range is [start, end)
+        if scene_range[0] != -1 and scene_range[-1] != -1 and (scene_in_data_index < scene_range[0] or scene_in_data_index >= scene_range[1]):
+            continue
+        bounds = dataset.scene_id_bounds[scene_id]
+        flag_has_dufo_label = True
+        with h5py.File(os.path.join(data_path, f'{scene_id}.h5'), 'r+') as f:
+            for ii in range(bounds["min_index"], bounds["max_index"]+1):
+                key = str(dataset[ii]['timestamp'])
+                if "dufo_label" not in f[key]:
+                    flag_has_dufo_label = False
+                    break
+        if flag_has_dufo_label and not overwrite:
+            print(f"==> Scene {scene_id} has dufo_label, skip.")
+            continue
+
+        mydufo = dufomap(0.2, 0.2, 1, num_threads=12)  # resolution, d_s, d_p, hit_extension
+        mydufo.setCluster(0, 20, 0.2)  # depth=0, min_points=20, max_dist=0.2
+
+        print(f"==> Scene {scene_id} start, data path: {data_path}")
+        for i in tqdm(range(bounds["min_index"], bounds["max_index"]+1), desc=f"Dufo run: {scene_in_data_index}/{len(all_scene_ids)}", ncols=80):
+            if interval != 1 and i % interval != 0 and (i + interval//2 < bounds["max_index"] or i - interval//2 > bounds["min_index"]):
+                continue
+            data = dataset[i]
+            assert data['scene_id'] == scene_id, f"Check the data, scene_id {scene_id} is not consistent for the {i}th frame in the {scene_in_data_index}th scene."
+            # HARD CODED: remove points outside the range
+            norm_pc0 = np.linalg.norm(data['pc0'][:, :3], axis=1)
+            range_mask = (
+                (norm_pc0 > MIN_AXIS_RANGE) &
+                (norm_pc0 < MAX_AXIS_RANGE)
+            )
+            pose_array = transform_to_array(data['pose0'])
+            mydufo.run(data['pc0'][range_mask], pose_array, cloud_transform=True)
+
+        # integration finished, start segmenting; needed since we have map.label inside dufo
+        mydufo.oncePropagateCluster(if_cluster=True, if_propagate=True)
+        for i in tqdm(range(bounds["min_index"], bounds["max_index"]+1), desc=f"Start Segment: {scene_in_data_index}/{len(all_scene_ids)}", ncols=80):
+            data = dataset[i]
+            pc0 = data['pc0']
+            gm0 = data['gm0']
+            pose_array = transform_to_array(data['pose0'])
+            dufo_label = np.array(mydufo.segment(pc0, pose_array, cloud_transform=True))
+            dufo_labels = np.zeros(pc0.shape[0], dtype=np.uint8)
+            dufo_labels[~gm0] = dufo_label[~gm0]
+
+            # save labels
+            timestamp = data['timestamp']
+            key = str(timestamp)
+            with h5py.File(os.path.join(data_path, f'{scene_id}.h5'), 'r+') as f:
+                if "dufo_label" in f[key]:
+                    # print(f"Warning: {scene_id} {timestamp} has label, will be overwritten.")
+                    del f[key]["dufo_label"]
+                f[key].create_dataset("dufo_label", data=np.array(dufo_labels).astype(np.uint8))
+        print(f"==> Scene {scene_id} finished, used: {(time.time() - start_time)/60:.2f} mins")
+    print(f"Data inside {str(data_path)} finished. Check the result with the vis() function if you want to visualize it.")
+
+if __name__ == '__main__':
+    start_time = time.time()
+    # step 1: run dufomap
+    fire.Fire(run_dufo)
+    # step 2: run clustering on the dufo_label
+    fire.Fire(run_cluster)
+
+    print(f"\nTime used: {(time.time() - start_time)/60:.2f} mins")

tools/visualization.py

Lines changed: 5 additions & 2 deletions
@@ -62,7 +62,7 @@ def check_flow(
 def vis(
     data_dir: str = "/home/kin/data/av2/preprocess/sensor/mini",
     res_name: str = "flow",  # "flow", "flow_est"
-    start_id: int = -1,
+    start_id: int = 0,
     point_size: float = 2.0,
 ):
     dataset = HDF5Data(data_dir, vis_name=res_name, flow_view=True)
@@ -88,7 +88,10 @@ def vis(
         pose_flow = pc0[:, :3] @ ego_pose[:3, :3].T + ego_pose[:3, 3] - pc0[:, :3]
 
         pcd = o3d.geometry.PointCloud()
-        if res_name in ['dufo_label', 'label']:
+        if res_name == 'raw':  # no result, only show the **raw point cloud**
+            pcd.points = o3d.utility.Vector3dVector(pc0[:, :3])
+            pcd.paint_uniform_color([1.0, 1.0, 1.0])
+        elif res_name in ['dufo_label', 'label']:
             labels = data[res_name]
             pcd_i = o3d.geometry.PointCloud()
             for label_i in np.unique(labels):
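The `pose_flow` context line in the second hunk computes the flow induced purely by ego motion: transform `pc0` by the ego pose, then subtract the original points. A tiny self-contained check with a toy pose (nothing loaded from disk):

```python
# Sketch of what the pose_flow line in vis() computes, for a trivial pose.
import numpy as np

ego_pose = np.eye(4)
ego_pose[:3, 3] = [1.0, 0.0, 0.0]   # pure 1 m translation along x
pc0 = np.zeros((5, 3))              # five points at the origin
pose_flow = pc0 @ ego_pose[:3, :3].T + ego_pose[:3, 3] - pc0
assert np.allclose(pose_flow, [1.0, 0.0, 0.0])  # every point "moves" 1 m in x
print(pose_flow[0])  # [1. 0. 0.]
```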
