Skip to content

Commit a04b3af

Browse files
Merge pull request #2746 from devitocodes/deviceid
compiler: Make nbytes available mapper aware of visible devices environment variables
2 parents c207ded + 35304b9 commit a04b3af

4 files changed

Lines changed: 193 additions & 17 deletions

File tree

devito/arch/archinfo.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@
1616
from devito.logger import warning
1717
from devito.tools import as_tuple, all_equal, memoized_func
1818

19-
__all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_nvidia_cc',
20-
'get_cuda_path', 'get_hip_path', 'check_cuda_runtime', 'get_m1_llvm_path',
21-
'get_advisor_path', 'Platform', 'Cpu64', 'Intel64', 'IntelSkylake', 'Amd',
22-
'Arm', 'Power', 'Device', 'NvidiaDevice', 'AmdDevice', 'IntelDevice',
19+
__all__ = ['platform_registry', 'get_cpu_info', 'get_gpu_info', 'get_visible_devices',
20+
'get_nvidia_cc', 'get_cuda_path', 'get_hip_path', 'check_cuda_runtime',
21+
'get_m1_llvm_path', 'get_advisor_path', 'Platform', 'Cpu64', 'Intel64',
22+
'IntelSkylake', 'Amd', 'Arm', 'Power', 'Device', 'NvidiaDevice',
23+
'AmdDevice', 'IntelDevice',
2324
# Brand-agnostic
2425
'ANYCPU', 'ANYGPU',
2526
# Intel CPUs
@@ -488,6 +489,27 @@ def parse_product_arch():
488489
return None
489490

490491

492+
def get_visible_devices():
    """
    Return the device IDs listed by the visible-devices environment variables.

    ``CUDA_VISIBLE_DEVICES``, ``ROCR_VISIBLE_DEVICES`` and
    ``HIP_VISIBLE_DEVICES`` are inspected in this order. The first one whose
    value parses as a comma-separated list of integers determines the result.

    Returns
    -------
    tuple of int or None
        The IDs in the order in which they appear in the environment variable,
        or None if no variable is set to a list of integers.
    """
    candidates = (
        'CUDA_VISIBLE_DEVICES',
        'ROCR_VISIBLE_DEVICES',
        'HIP_VISIBLE_DEVICES'
    )

    for v in candidates:
        if v not in os.environ:
            # Environment variable not set
            continue

        entries = os.environ[v].split(',')
        try:
            return tuple(int(i) for i in entries)
        except ValueError:
            # Visible devices set via UUIDs or other non-integer identifiers.
            # Warn, then fall through to the next candidate variable
            warning("Setting visible devices via UUIDs or other non-integer"
                    " identifiers is currently unsupported: environment variable"
                    f" {v}={os.environ[v]} ignored.")

    return None
511+
512+
491513
@memoized_func
492514
def get_nvidia_cc():
493515
libnames = ('libcuda.so', 'libcuda.dylib', 'cuda.dll')

devito/operator/operator.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
import sympy
1111
import numpy as np
1212

13-
from devito.arch import ANYCPU, Device, compiler_registry, platform_registry
13+
from devito.arch import (ANYCPU, Device, compiler_registry, platform_registry,
14+
get_visible_devices)
1415
from devito.data import default_allocator
1516
from devito.exceptions import (CompilationError, ExecutionError, InvalidArgument,
1617
InvalidOperator)
@@ -1388,6 +1389,23 @@ def _get_nbytes(self, i):
13881389

13891390
return nbytes
13901391

1392+
@cached_property
def _physical_deviceid(self):
    """
    The physical ID of the device targeted by this set of arguments, or None
    if the platform is not a Device.

    The logical device ID is the `deviceid` entry if non-negative; otherwise
    it is the MPI rank (0 if the communicator is `MPI.COMM_NULL`). If
    `get_visible_devices()` returns a tuple, the logical ID is used as an
    index into it; otherwise the logical ID is returned unchanged.
    """
    if not isinstance(self.platform, Device):
        return None

    # Get the physical device ID (as CUDA_VISIBLE_DEVICES may be set)
    logical_deviceid = self.get('deviceid', -1)
    if logical_deviceid < 0:
        # No explicit device requested: derive it from the MPI rank
        if self.comm != MPI.COMM_NULL:
            logical_deviceid = self.comm.Get_rank()
        else:
            logical_deviceid = 0

    visible_devices = get_visible_devices()
    if visible_devices is None:
        return logical_deviceid

    # The logical ID indexes into the tuple of visible devices
    return visible_devices[logical_deviceid]
1408+
13911409
@cached_property
13921410
def nbytes_avail_mapper(self):
13931411
"""
@@ -1402,8 +1420,8 @@ def nbytes_avail_mapper(self):
14021420

14031421
# The amount of space available on the device
14041422
if isinstance(self.platform, Device):
1405-
deviceid = max(self.get('deviceid', 0), 0)
1406-
mapper[device_layer] = self.platform.memavail(deviceid=deviceid)
1423+
mapper[device_layer] = \
1424+
self.platform.memavail(deviceid=self._physical_deviceid)
14071425

14081426
# The amount of space available on the host
14091427
try:

devito/parameters.py

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from devito.tools import Signer, filter_ordered
99

1010
__all__ = ['configuration', 'init_configuration', 'print_defaults', 'print_state',
11-
'switchconfig']
11+
'switchconfig', 'switchenv']
1212

1313
# Be EXTREMELY careful when writing to a Parameters dictionary
1414
# Read here for reference: http://wiki.c2.com/?GlobalVariablesAreBad
@@ -224,7 +224,22 @@ def init_configuration(configuration=configuration, env_vars_mapper=env_vars_map
224224
configuration.initialize()
225225

226226

227-
class switchconfig:
227+
class abstractswitch:

    """
    Abstract base class for `switch*` decorators.

    Subclasses implement the context manager protocol (`__enter__` and
    `__exit__`); this class makes any instance usable as a function
    decorator too, by running the decorated function inside `with self`.
    """

    def __call__(self, func, *args, **kwargs):
        """
        Decorate `func` so that each call is executed within `with self`.

        Parameters
        ----------
        func : callable
            The function to be wrapped.
        *args, **kwargs
            Unused; accepted for backward compatibility.

        Returns
        -------
        callable
            A wrapper preserving the metadata of `func`. It returns whatever
            `func` returns, or None if `func` raises and the `__exit__` of
            the subclass suppresses the exception.
        """
        @wraps(func)
        def wrapper(*args, **kwargs):
            with self:
                # Return from inside the `with` block: if `__exit__` swallows
                # an exception there is no result to return, and control
                # simply falls through to the implicit `return None`
                return func(*args, **kwargs)
        return wrapper
240+
241+
242+
class switchconfig(abstractswitch):
228243

229244
"""
230245
Decorator or context manager to temporarily change `configuration` parameters.
@@ -251,13 +266,28 @@ def __exit__(self, exc_type, exc_val, exc_tb):
251266
# E.g., `platform` and `compiler` will end up here
252267
super(Parameters, configuration).__setitem__(k, self.previous[k])
253268

254-
def __call__(self, func, *args, **kwargs):
255-
@wraps(func)
256-
def wrapper(*args, **kwargs):
257-
with self:
258-
result = func(*args, **kwargs)
259-
return result
260-
return wrapper
269+
270+
class switchenv(abstractswitch):
    """
    Decorator or context manager to temporarily change environment variables.
    Adapted from https://stackoverflow.com/questions/2059482/

    Parameters
    ----------
    condition : bool, optional
        If True (default), the names in `params` are converted to uppercase
        before being written to the environment. If False, they are used
        verbatim.
    **params
        The environment variables to set, as `name=value` pairs.

    Notes
    -----
    On exit, the whole environment is restored to the state it had on entry,
    so any other change made to the environment within the scope is discarded
    as well.
    """

    def __init__(self, condition=True, **params):
        # Kept for backward compatibility; the snapshot that is actually
        # restored is taken upon `__enter__`
        self.previous = dict(environ)
        # One snapshot per active `with`, so that the same instance may be
        # re-entered (e.g., when decorating a recursive function)
        self._snapshots = []

        if condition:
            # Environment variables are essentially always uppercase
            self.params = {k.upper(): v for k, v in params.items()}
        else:
            self.params = params

    def __enter__(self, condition=True, **params):
        # NOTE: `condition` and `params` are unused here; they are retained
        # only to leave the signature untouched
        # Snapshot at entry rather than at construction time: a decorator is
        # instantiated once, possibly long before the decorated function runs
        self._snapshots.append(dict(environ))
        environ.update(self.params)

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self._snapshots:
            self.previous = self._snapshots.pop()
        environ.clear()
        environ.update(self.previous)
261291

262292

263293
def print_defaults():

tests/test_gpu_common.py

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import cloudpickle as pickle
22

33
import pytest
4+
import os
45
import numpy as np
56
import sympy
67
import scipy.sparse
@@ -10,7 +11,7 @@
1011
Dimension, MatrixSparseTimeFunction, SparseTimeFunction,
1112
SubDimension, SubDomain, SubDomainSet, TimeFunction, exp,
1213
Operator, configuration, switchconfig, TensorTimeFunction,
13-
Buffer, assign)
14+
Buffer, assign, switchenv)
1415
from devito.arch import get_gpu_info, get_cpu_info, Device, Cpu64
1516
from devito.exceptions import InvalidArgument
1617
from devito.ir import (Conditional, Expression, Section, FindNodes, FindSymbols,
@@ -74,6 +75,111 @@ def test_autopad_with_platform_switch(self):
7475
assert f.shape_allocated[1] == 64
7576

7677

78+
class TestDeviceID:
79+
"""
80+
Test that device IDs and associated environment variables such as
81+
CUDA_VISIBLE_DEVICES are correctly handled.
82+
"""
83+
84+
@pytest.mark.parametrize('env_variables', [{"cuda_visible_devices": "1"},
85+
{"cuda_visible_devices": "1,2"},
86+
{"cuda_visible_devices": "1,0"},
87+
{"rocr_visible_devices": "1"},
88+
{"hip_visible_devices": " 1"}])
89+
def test_visible_devices(self, env_variables):
90+
"""
91+
Test that physical device IDs used for querying memory on a device via
92+
nvidia-smi correctly account for visible-device environment variables.
93+
"""
94+
grid = Grid(shape=(10, 10))
95+
u = Function(name='u', grid=grid)
96+
97+
eq = Eq(u, u+1)
98+
99+
# Save previous environment to verify switchenv works as intended
100+
previous_environ = dict(os.environ)
101+
102+
with switchenv(**env_variables):
103+
op1 = Operator(eq)
104+
105+
argmap1 = op1.arguments()
106+
# All variants in parameterisation should yield deviceid 1
107+
assert argmap1._physical_deviceid == 1
108+
109+
# Make sure the switchenv doesn't somehow persist
110+
assert dict(os.environ) == previous_environ
111+
112+
# Check that physical deviceid is 0 when no environment variables set
113+
op2 = Operator(eq)
114+
115+
argmap2 = op2.arguments()
116+
# Default physical deviceid expected to be 0
117+
assert argmap2._physical_deviceid == 0
118+
119+
@pytest.mark.parallel(mode=2)
120+
@pytest.mark.parametrize('visible_devices', ["1,2", "1,0", "0,2,3"])
121+
def test_visible_devices_mpi(self, visible_devices, mode):
122+
"""
123+
Test that physical device IDs used for querying memory on a device via
124+
nvidia-smi correctly account for visible-device environment variables
125+
when using MPI.
126+
"""
127+
128+
grid = Grid(shape=(10, 10))
129+
rank = grid.distributor.myrank
130+
u = Function(name='u', grid=grid)
131+
132+
eq = Eq(u, u+1)
133+
134+
with switchenv(cuda_visible_devices=visible_devices):
135+
op1 = Operator(eq)
136+
argmap1 = op1.arguments()
137+
138+
devices = [int(i) for i in visible_devices.split(',')]
139+
140+
assert argmap1._physical_deviceid == devices[rank]
141+
142+
# In default case, physical deviceid will equal rank
143+
op2 = Operator(eq)
144+
argmap2 = op2.arguments()
145+
assert argmap2._physical_deviceid == rank
146+
147+
def test_visible_devices_with_devito_deviceid(self):
148+
"""Test interaction between CUDA_VISIBLE_DEVICES and DEVITO_DEVICEID"""
149+
grid = Grid(shape=(10, 10))
150+
u = Function(name='u', grid=grid)
151+
152+
eq = Eq(u, u+1)
153+
154+
with switchenv(cuda_visible_devices="1,3"), switchconfig(deviceid=1):
155+
op = Operator(eq)
156+
157+
argmap = op.arguments()
158+
# deviceid should see the world from within CUDA_VISIBLE_DEVICES
159+
# So should be the second of the two visible devices specified (3)
160+
assert argmap._physical_deviceid == 3
161+
162+
@pytest.mark.parallel(mode=2)
163+
def test_deviceid_per_rank(self, mode):
164+
"""
165+
Test that Device IDs set by the user on a per-rank basis do not
166+
get modified.
167+
"""
168+
# Reversed order to ensure it is different to default
169+
user_set_deviceids = (1, 0)
170+
171+
grid = Grid(shape=(10, 10))
172+
u = Function(name='u', grid=grid)
173+
174+
rank = grid.distributor.myrank
175+
deviceid = user_set_deviceids[rank]
176+
177+
op = Operator(Eq(u, u+1))
178+
179+
argmap = op.arguments(deviceid=deviceid)
180+
assert argmap._physical_deviceid == deviceid
181+
182+
77183
class TestCodeGeneration:
78184

79185
def test_maxpar_option(self):

0 commit comments

Comments
 (0)