Skip to content

Commit b480cde

Browse files
EdCaunt authored and mloubout committed
compiler: Make nbytes_avail_mapper aware of visible devices
1 parent 4d270cf commit b480cde

2 files changed

Lines changed: 28 additions & 22 deletions

File tree

devito/operator/operator.py

Lines changed: 28 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import os
12
from collections import OrderedDict, namedtuple
23
from functools import cached_property
34
import ctypes
@@ -1388,6 +1389,23 @@ def _get_nbytes(self, i):
13881389

13891390
return nbytes
13901391

1392+
@cached_property
1393+
def visible_devices(self):
1394+
device_vars = (
1395+
'CUDA_VISIBLE_DEVICES',
1396+
'ROCR_VISIBLE_DEVICES',
1397+
'HIP_VISIBLE_DEVICES'
1398+
)
1399+
for v in device_vars:
1400+
if v in os.environ:
1401+
try:
1402+
return tuple(int(i) for i in os.environ[v].split(','))
1403+
except ValueError:
1404+
# Visible devices set via UUIDs or other non-integer identifiers
1405+
continue
1406+
1407+
return None
1408+
13911409
@cached_property
13921410
def nbytes_avail_mapper(self):
13931411
"""
@@ -1402,11 +1420,16 @@ def nbytes_avail_mapper(self):
14021420

14031421
# The amount of space available on the device
14041422
if isinstance(self.platform, Device):
1405-
deviceid = max(self.get('deviceid', 0), 0)
1406-
# FIXME: I think this perhaps picks the wrong device when CUDA_VISIBLE_DEVICES set?
1407-
# Looks like it uses the physical device ID, not the logical one due to dependence
1408-
# on Nvidia SMI -> remote into Timewarp and check this
1409-
mapper[device_layer] = self.platform.memavail(deviceid=deviceid)
1423+
# Get the physical device ID (as CUDA_VISIBLE_DEVICES may be set)
1424+
rank = self.comm.Get_rank() if self.comm != MPI.COMM_NULL else 0
1425+
1426+
logical_deviceid = max(self.get('deviceid', 0), 0) + rank
1427+
if self.visible_devices is not None:
1428+
physical_deviceid = self.visible_devices[logical_deviceid]
1429+
else:
1430+
physical_deviceid = logical_deviceid
1431+
1432+
mapper[device_layer] = self.platform.memavail(deviceid=physical_deviceid)
14101433

14111434
# The amount of space available on the host
14121435
try:

devito/types/parallel.py

Lines changed: 0 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -279,23 +279,6 @@ class DeviceID(DeviceSymbol):
279279
def default_value(self):
280280
return -1
281281

282-
@cached_property
283-
def visible_devices(self):
284-
device_vars = (
285-
'CUDA_VISIBLE_DEVICES',
286-
'ROCR_VISIBLE_DEVICES',
287-
'HIP_VISIBLE_DEVICES'
288-
)
289-
for v in device_vars:
290-
if v in os.environ:
291-
try:
292-
return tuple(int(i) for i in os.environ[v].split(','))
293-
except ValueError:
294-
# Visible devices set via UUIDs or other non-integer identifiers
295-
continue
296-
297-
return None
298-
299282
def _arg_values(self, **kwargs):
300283
if self.name in kwargs:
301284
return {self.name: kwargs.pop(self.name)}

0 commit comments

Comments
 (0)