
Commit 2b4b272

docs(process): updates.
Merge branch 'SeFlow' into DeFlow
2 parents 37fc9d3 + 7ad5b85

5 files changed: 181 additions & 26 deletions

dataprocess/README.md

Lines changed: 15 additions & 15 deletions
@@ -12,47 +12,47 @@ We've updated the process dataset for:
 - [x] Waymo: check [here](#waymo-dataset). The process script was adapted from [SeFlow](https://github.com/KTH-RPL/SeFlow).
 - [ ] nuScenes: done coding, public after review. Will be integrated later by another paper.
 
-If you want to use all datasets above, there is a specific process environment in [envprocess.yml](../envprocess.yml) to install all the necessary packages. As Waymo package have different configuration and conflict with the main environment. Setup through the following command:
+If you want to use all the datasets above, there is a dedicated process environment in [envprocess.yaml](../envprocess.yaml) that installs all the necessary packages, since the Waymo package has a different configuration and conflicts with the main environment. Set it up with the following commands:
 
 ```bash
-conda env create -f envprocess.yml
+conda env create -f envprocess.yaml
 conda activate dataprocess
+# NOTE: we need to **manually reinstall numpy** (1.22 or higher):
+# * the waymo package forces numpy==1.21.5, BUT
+# * hdbscan with numpy<1.22.0 raises: "'numpy.float64' object cannot be interpreted as an integer"
+# * av2 needs numpy>=1.22.0, and waymo still runs fine with numpy==1.22.0
+pip install numpy==1.22
 ```
 
 ## Download
 
 ### Argoverse 2.0
 
-Install their download tool:
-```bash
-mamba install s5cmd -c conda-forge
-```
-
-Download the dataset:
+Install their download tool `s5cmd` (already included in our envprocess.yaml), then download the dataset:
 ```bash
 # train is really big (750): 966 GB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/sensor/train/*" sensor/train
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/sensor/train/*" av2/sensor/train
 
 # val (150) and test (150): 168 GB + 168 GB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/sensor/val/*" sensor/val
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/sensor/test/*" sensor/test
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/sensor/val/*" av2/sensor/val
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/sensor/test/*" av2/sensor/test
 
 # the local and online eval masks from the official repo
 s5cmd --no-sign-request cp "s3://argoverse/tasks/3d_scene_flow/zips/*" .
 ```
 
 Then, to quickly pre-process the data, [read these commands](#process) to generate the pre-processed data for training and evaluation. This takes around 0.5-2 hours for the whole dataset (train & val), depending on how powerful your CPU is.
 
-More [self-supervised data in AV2 LiDAR only](https://www.argoverse.org/av2.html#lidar-link), note: It **does not** include **imagery or 3D annotations**. The dataset is designed to support research into self-supervised learning in the lidar domain, as well as point cloud forecasting.
+Optional: more [self-supervised data in the AV2 LiDAR-only split](https://www.argoverse.org/av2.html#lidar-link). Note: it **does not** include **imagery or 3D annotations**; the dataset is designed to support research into self-supervised learning in the lidar domain, as well as point cloud forecasting.
 ```bash
 # train is really big (16000): 4 TB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/lidar/train/*" lidar/train
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/lidar/train/*" av2/lidar/train
 
 # val (2000): 0.5 TB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/lidar/val/*" lidar/val
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/lidar/val/*" av2/lidar/val
 
 # test (2000): 0.5 TB in total
-s5cmd --no-sign-request cp "s3://argoverse/datasets/av2/lidar/test/*" lidar/test
+s5cmd --numworkers 12 --no-sign-request cp "s3://argoverse/datasets/av2/lidar/test/*" av2/lidar/test
 ```
 
 #### Dataset frames
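Since the numpy note above is the part most likely to go wrong, here is a minimal sanity check (a sketch, assuming the `dataprocess` environment is active and `pip install numpy==1.22` has been run):

```python
# Hedged sanity check for the numpy pin described in the README diff above.
import numpy as np

major, minor = (int(x) for x in np.__version__.split(".")[:2])
assert (major, minor) >= (1, 22), (
    f"numpy {np.__version__} is too old: hdbscan on numpy<1.22 fails with "
    "\"'numpy.float64' object cannot be interpreted as an integer\""
)
print(f"numpy {np.__version__}: new enough for hdbscan and av2; waymo still runs fine")
```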

dataprocess/extract_waymo.py

Lines changed: 1 addition & 3 deletions
@@ -346,7 +346,6 @@ def create_group_data(group, pc, pose, gm = None, flow_0to1=None, flow_valid=None
     first_frame = dataset_pb2.Frame.FromString(bytearray(all_data[0]))
     scene_id = first_frame.context.name
     total_lens = len(all_data)
-    # for data_idx in tqdm(range(1, total_lens), ncols=100):
     for data_idx in range(1, total_lens):
         if data_idx >= total_lens - 2:
             # 0: no correct flow label, end(total_lens - 1) - 1: no correct pose flow
@@ -384,7 +383,6 @@ def process_logs(data_dir: Path, map_dir: Path, output_dir: Path, nproc: int):
         data_dir: Argoverse 2.0 directory
         output_dir: Output directory.
     """
-
     if not (data_dir).exists():
         print(f'{data_dir} not found')
         return
@@ -408,7 +406,7 @@ def process_logs(data_dir: Path, map_dir: Path, output_dir: Path, nproc: int):
 def main(
     flow_data_dir: str = "/home/kin/data/waymo/flowlabel",
     mode: str = "test",
-    map_dir: str = "/home/kin/data/waymo/flowlabel/maps",
+    map_dir: str = "/home/kin/data/waymo/flowlabel/map",
     output_dir: str = "/home/kin/data/waymo/flowlabel/preprocess",
     nproc: int = (multiprocessing.cpu_count() - 1),
     create_index_only: bool = False,
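The first hunk's boundary check is easy to misread, so here is a toy illustration (hypothetical frame count, and assuming the truncated loop body skips the flagged frames, as the inline comment suggests): frame 0 has no correct flow label and the last two frames have no correct pose flow, so a scene of N frames yields N-3 usable samples.

```python
# Sketch of the usable-frame window implied by the hunk above (toy length).
total_lens = 10  # e.g. a 10-frame scene
usable = [i for i in range(1, total_lens) if i < total_lens - 2]
print(usable)  # [1, 2, 3, 4, 5, 6, 7] -> frames 0, 8 and 9 are dropped
```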

envprocess.yaml

Lines changed: 7 additions & 6 deletions
@@ -6,23 +6,24 @@ dependencies:
   - python=3.8
   - pytorch::pytorch=2.0.0
   - pytorch::torchvision
+  - mkl==2024.0.0
   - numba
-  - numpy==1.22
+  - numpy
   - pandas
   - pip
   - scipy
   - tqdm
-  - scikit-learn
   - fire
+  - hdbscan
+  - s5cmd
   - pip:
     - nuscenes-devkit
     - av2==0.2.1
     - waymo-open-dataset-tf-2.11.0==1.5.0
-    - open3d==0.16.0
+    - dufomap==1.0.0
     - linefit
     - dztimer
-    - dufomap==1.0.0
-    - evalai
 
 # Reason about the versions fixed:
-# numpy==1.22: package conflicts, need numpy higher or same 1.22
+# numpy==1.22: package conflicts, need numpy >= 1.22
+# mkl==2024.0.0: https://github.com/pytorch/pytorch/issues/123097
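A quick way to confirm the updated environment resolved cleanly is an import smoke test; the sketch below only uses package names from the lists above (`s5cmd` ships as a CLI binary, not a Python module, so it is left out). Importing `torch` also exercises the new `mkl` pin, since the linked pytorch issue shows up as a failure at import time:

```python
# Smoke test (a sketch) for the dependency changes in envprocess.yaml above.
import importlib

for mod in ("torch", "numpy", "hdbscan", "dufomap", "av2"):
    m = importlib.import_module(mod)
    print(mod, getattr(m, "__version__", "(no __version__)"))
```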

process.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
+"""
+# Created: 2023-11-04 15:55
+# Copyright (C) 2023-now, RPL, KTH Royal Institute of Technology
+# Author: Qingwen Zhang (https://kin-zhang.github.io/)
+#
+# This file is part of SeFlow (https://github.com/KTH-RPL/SeFlow).
+# If you find this repo helpful, please cite the respective publication as
+# listed on the above website.
+#
+# Description: run dufomap on the dataset we preprocessed, for the SSL training afterwards.
+#              It's only needed for SSL training, not for inference.
+#              The goal is to roughly segment dynamic and static points.
+"""
+
+from pathlib import Path
+from tqdm import tqdm
+import numpy as np
+import fire, time, h5py, os
+from hdbscan import HDBSCAN
+
+from src.utils.mics import HDF5Data, transform_to_array
+from dufomap import dufomap
+
+MIN_AXIS_RANGE = 2   # HARD CODED: remove ego vehicle points
+MAX_AXIS_RANGE = 50  # HARD CODED: remove far away points
+
+def run_cluster(
+    data_dir: str = "/home/kin/data/av2/preprocess/sensor/train",
+    scene_range: list = [0, 1],
+    interval: int = 1,  # unused here, kept so both functions share the same interface args
+    overwrite: bool = False,
+):
+    data_path = Path(data_dir)
+    dataset = HDF5Data(data_path)
+    all_scene_ids = list(dataset.scene_id_bounds.keys())
+    for scene_in_data_index, scene_id in enumerate(all_scene_ids):
+        start_time = time.time()
+        # NOTE (Qingwen): the scene id range is [start, end)
+        if scene_range[0] != -1 and scene_range[-1] != -1 and (scene_in_data_index < scene_range[0] or scene_in_data_index >= scene_range[1]):
+            continue
+        bounds = dataset.scene_id_bounds[scene_id]
+        flag_exist_label = True
+        with h5py.File(os.path.join(data_path, f'{scene_id}.h5'), 'r+') as f:
+            for ii in range(bounds["min_index"], bounds["max_index"]+1):
+                key = str(dataset[ii]['timestamp'])
+                if 'label' not in f[key]:
+                    flag_exist_label = False
+                    break
+        if flag_exist_label and not overwrite:
+            print(f"==> Scene {scene_id} already has the cluster 'label', skip.")
+            continue
+
+        hdb = HDBSCAN(min_cluster_size=20, cluster_selection_epsilon=0.7)
+        for i in tqdm(range(bounds["min_index"], bounds["max_index"]+1), desc=f"Start Plus Cluster: {scene_in_data_index}/{len(all_scene_ids)}", ncols=80):
+            data = dataset[i]
+            pc0 = data['pc0'][:, :3]
+            cluster_label = np.zeros(pc0.shape[0], dtype=np.int16)
+
+            if "dufo_label" not in data:
+                print(f"Warning: {scene_id} {data['timestamp']} has no dufo_label and will be skipped. Better to rerun dufomap on this scene.")
+                continue
+            elif data["dufo_label"].sum() < 20:
+                print(f"Warning: {scene_id} {data['timestamp']} has no dynamic points and will be skipped. Better to check this scene.")
+            else:
+                hdb.fit(pc0[data["dufo_label"] == 1])
+                # NOTE (Qingwen): -1 is assigned when a point belongs to no cluster, so shift by 1 to keep 0 for background.
+                cluster_label[data["dufo_label"] == 1] = hdb.labels_ + 1
+
+            # save labels
+            timestamp = data['timestamp']
+            key = str(timestamp)
+            with h5py.File(os.path.join(data_path, f'{scene_id}.h5'), 'r+') as f:
+                if 'label' in f[key]:
+                    # print(f"Warning: {scene_id} {timestamp} has label, will be overwritten.")
+                    del f[key]['label']
+                f[key].create_dataset('label', data=np.array(cluster_label).astype(np.int16))
+        print(f"==> Scene {scene_id} finished, used: {(time.time() - start_time)/60:.2f} mins")
+    print(f"Data inside {str(data_path)} finished. Check the result with the vis() function if you want to visualize it.")
+
+def run_dufo(
+    data_dir: str = "/home/kin/data/av2/preprocess/sensor/train",
+    scene_range: list = [0, 1],
+    interval: int = 1,  # interval of frames to run dufomap on
+    overwrite: bool = False,
+):
+    data_path = Path(data_dir)
+    dataset = HDF5Data(data_path)
+    all_scene_ids = list(dataset.scene_id_bounds.keys())
+    for scene_in_data_index, scene_id in enumerate(all_scene_ids):
+        start_time = time.time()
+        # NOTE (Qingwen): the scene id range is [start, end)
+        if scene_range[0] != -1 and scene_range[-1] != -1 and (scene_in_data_index < scene_range[0] or scene_in_data_index >= scene_range[1]):
+            continue
+        bounds = dataset.scene_id_bounds[scene_id]
+        flag_has_dufo_label = True
+        with h5py.File(os.path.join(data_path, f'{scene_id}.h5'), 'r+') as f:
+            for ii in range(bounds["min_index"], bounds["max_index"]+1):
+                key = str(dataset[ii]['timestamp'])
+                if "dufo_label" not in f[key]:
+                    flag_has_dufo_label = False
+                    break
+        if flag_has_dufo_label and not overwrite:
+            print(f"==> Scene {scene_id} has dufo_label, skip.")
+            continue
+
+        mydufo = dufomap(0.2, 0.2, 1, num_threads=12)  # resolution, d_s, d_p, hit_extension
+        mydufo.setCluster(0, 20, 0.2)  # depth=0, min_points=20, max_dist=0.2
+
+        print(f"==> Scene {scene_id} start, data path: {data_path}")
+        for i in tqdm(range(bounds["min_index"], bounds["max_index"]+1), desc=f"Dufo run: {scene_in_data_index}/{len(all_scene_ids)}", ncols=80):
+            if interval != 1 and i % interval != 0 and (i + interval//2 < bounds["max_index"] or i - interval//2 > bounds["min_index"]):
+                continue
+            data = dataset[i]
+            assert data['scene_id'] == scene_id, f"Check the data, scene_id {scene_id} is not consistent for the {i}th frame in the {scene_in_data_index}th scene."
+            # HARD CODED: remove points outside the range
+            norm_pc0 = np.linalg.norm(data['pc0'][:, :3], axis=1)
+            range_mask = (
+                (norm_pc0 > MIN_AXIS_RANGE) &
+                (norm_pc0 < MAX_AXIS_RANGE)
+            )
+            pose_array = transform_to_array(data['pose0'])
+            mydufo.run(data['pc0'][range_mask], pose_array, cloud_transform=True)
+
+        # integration finished, start segmenting; needed since we have map.label inside dufo
+        mydufo.oncePropagateCluster(if_cluster=True, if_propagate=True)
+        for i in tqdm(range(bounds["min_index"], bounds["max_index"]+1), desc=f"Start Segment: {scene_in_data_index}/{len(all_scene_ids)}", ncols=80):
+            data = dataset[i]
+            pc0 = data['pc0']
+            gm0 = data['gm0']
+            pose_array = transform_to_array(data['pose0'])
+            dufo_label = np.array(mydufo.segment(pc0, pose_array, cloud_transform=True))
+            dufo_labels = np.zeros(pc0.shape[0], dtype=np.uint8)
+            dufo_labels[~gm0] = dufo_label[~gm0]
+
+            # save labels
+            timestamp = data['timestamp']
+            key = str(timestamp)
+            with h5py.File(os.path.join(data_path, f'{scene_id}.h5'), 'r+') as f:
+                if "dufo_label" in f[key]:
+                    # print(f"Warning: {scene_id} {timestamp} has label, will be overwritten.")
+                    del f[key]["dufo_label"]
+                f[key].create_dataset("dufo_label", data=np.array(dufo_labels).astype(np.uint8))
+        print(f"==> Scene {scene_id} finished, used: {(time.time() - start_time)/60:.2f} mins")
+    print(f"Data inside {str(data_path)} finished. Check the result with the vis() function if you want to visualize it.")
+
+if __name__ == '__main__':
+    start_time = time.time()
+    # step 1: run dufomap
+    fire.Fire(run_dufo)
+    # step 2: run clustering on the dufo_label
+    fire.Fire(run_cluster)
+
+    print(f"\nTime used: {(time.time() - start_time)/60:.2f} mins")

tools/visualization.py

Lines changed: 5 additions & 2 deletions
@@ -62,7 +62,7 @@ def check_flow(
 def vis(
     data_dir: str = "/home/kin/data/av2/preprocess/sensor/mini",
     res_name: str = "flow",  # "flow", "flow_est"
-    start_id: int = -1,
+    start_id: int = 0,
     point_size: float = 2.0,
 ):
     dataset = HDF5Data(data_dir, vis_name=res_name, flow_view=True)
@@ -88,7 +88,10 @@ def vis(
         pose_flow = pc0[:, :3] @ ego_pose[:3, :3].T + ego_pose[:3, 3] - pc0[:, :3]
 
         pcd = o3d.geometry.PointCloud()
-        if res_name in ['dufo_label', 'label']:
+        if res_name == 'raw':  # no result, only show the **raw point cloud**
+            pcd.points = o3d.utility.Vector3dVector(pc0[:, :3])
+            pcd.paint_uniform_color([1.0, 1.0, 1.0])
+        elif res_name in ['dufo_label', 'label']:
             labels = data[res_name]
             pcd_i = o3d.geometry.PointCloud()
             for label_i in np.unique(labels):
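The `pose_flow` context line in the second hunk computes the flow induced purely by ego motion: transform `pc0` by the ego pose, then subtract the original points. A tiny self-contained check with a toy pose (nothing loaded from disk):

```python
# Sketch of what the pose_flow line in vis() computes, for a trivial pose.
import numpy as np

ego_pose = np.eye(4)
ego_pose[:3, 3] = [1.0, 0.0, 0.0]   # pure 1 m translation along x
pc0 = np.zeros((5, 3))              # five points at the origin
pose_flow = pc0 @ ego_pose[:3, :3].T + ego_pose[:3, 3] - pc0
assert np.allclose(pose_flow, [1.0, 0.0, 0.0])  # every point "moves" 1 m in x
print(pose_flow[0])  # [1. 0. 0.]
```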
