Merge branch 'master' of https://github.com/stan-dev/cmdstanpy

mitzimorris · mitzimorris · commit 5f4256e5c20d · 2020-08-06T18:02:36.000-04:00
diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py
@@ -470,7 +470,7 @@ def sample(
         :param parallel_chains: Number of processes to run in parallel. Must be
             a positive integer.  Defaults to ``multiprocessing.cpu_count()``.
 
-        :param threads_per_chain: the number of threads to use in parallelized
+        :param threads_per_chain: The number of threads to use in parallelized
             sections within an MCMC chain (e.g., when using the Stan functions
             ``reduce_sum()``  or ``map_rect()``).  This will only have an effect
             if the model was compiled with threading support. The total number
@@ -597,12 +597,12 @@ def sample(
             chain_ids = [x + 1 for x in range(chains)]
         else:
             if isinstance(chain_ids, int):
-                if chain_ids < 0:
+                if chain_ids < 1:
                     raise ValueError(
-                        'Chain_id must be a non-negative integer value,'
+                        'Chain_id must be a positive integer value,'
                         ' found {}.'.format(chain_ids)
                     )
-                chain_ids = [chain_ids + i + 1 for i in range(chains)]
+                chain_ids = [chain_ids + i for i in range(chains)]
             else:
                 if not len(chain_ids) == chains:
                     raise ValueError(
@@ -691,7 +691,7 @@ def sample(
                 refresh=refresh,
                 logger=self._logger,
             )
-            runset = RunSet(args=args, chains=chains)
+            runset = RunSet(args=args, chains=chains, chain_ids=chain_ids)
             pbar = None
             all_pbars = []
 
@@ -818,16 +818,19 @@ def generate_quantities(
             sample_csv_files = mcmc_sample.runset.csv_files
             sample_drawset = mcmc_sample.get_drawset()
             chains = mcmc_sample.chains
+            chain_ids = mcmc_sample.chain_ids
         elif isinstance(mcmc_sample, list):
+            if len(mcmc_sample) < 1:
+                raise ValueError('MCMC sample cannot be empty list')
             sample_csv_files = mcmc_sample
+            chains = len(sample_csv_files)
+            chain_ids = [x + 1 for x in range(chains)]
         else:
             raise ValueError(
                 'MCMC sample must be either CmdStanMCMC object'
                 ' or list of paths to sample csv_files.'
             )
-
         try:
-            chains = len(sample_csv_files)
             if sample_drawset is None:  # assemble sample from csv files
                 config = {}
                 # scan 1st csv file to get config
@@ -852,10 +855,10 @@ def generate_quantities(
                 args = CmdStanArgs(
                     self._name,
                     self._exe_file,
-                    chain_ids=[x + 1 for x in range(chains)],
+                    chain_ids=chain_ids,
                     method_args=sampler_args,
                 )
-                runset = RunSet(args=args, chains=chains)
+                runset = RunSet(args=args, chains=chains, chain_ids=chain_ids)
                 runset._csv_files = sample_csv_files
                 sample_fit = CmdStanMCMC(runset)
                 sample_drawset = sample_fit.get_drawset()
@@ -875,13 +878,13 @@ def generate_quantities(
             args = CmdStanArgs(
                 self._name,
                 self._exe_file,
-                chain_ids=[x + 1 for x in range(chains)],
+                chain_ids=chain_ids,
                 data=_data,
                 seed=seed,
                 output_dir=gq_output_dir,
                 method_args=generate_quantities_args,
             )
-            runset = RunSet(args=args, chains=chains)
+            runset = RunSet(args=args, chains=chains, chain_ids=chain_ids)
 
             parallel_chains_avail = cpu_count()
             parallel_chains = max(min(parallel_chains_avail - 2, chains), 1)
diff --git a/cmdstanpy/stanfit.py b/cmdstanpy/stanfit.py
@@ -34,7 +34,11 @@ class RunSet:
     """
 
     def __init__(
-        self, args: CmdStanArgs, chains: int = 4, logger: logging.Logger = None
+        self,
+        args: CmdStanArgs,
+        chains: int = 4,
+        chain_ids: List[int] = None,
+        logger: logging.Logger = None,
     ) -> None:
         """Initialize object."""
         self._args = args
@@ -45,7 +49,16 @@ def __init__(
                 'chains must be positive integer value, '
                 'found {}'.format(chains)
             )
-
+        if chain_ids is None:
+            chain_ids = [x + 1 for x in range(chains)]
+        elif len(chain_ids) != chains:
+            raise ValueError(
+                'mismatch between number of chains and chain_ids, '
+                'found {} chains, but {} chain_ids'.format(
+                    chains, len(chain_ids)
+                )
+            )
+        self._chain_ids = chain_ids
         self._retcodes = [-1 for _ in range(chains)]
 
         # stdout, stderr are written to text files
@@ -67,12 +80,13 @@ def __init__(
             if args.output_dir is None:
                 csv_file = create_named_text_file(
                     dir=output_dir,
-                    prefix='{}-{}-'.format(file_basename, i + 1),
+                    prefix='{}-{}-'.format(file_basename, str(chain_ids[i])),
                     suffix='.csv',
                 )
             else:
                 csv_file = os.path.join(
-                    output_dir, '{}-{}.{}'.format(file_basename, i + 1, 'csv')
+                    output_dir,
+                    '{}-{}.{}'.format(file_basename, str(chain_ids[i]), 'csv'),
                 )
             self._csv_files[i] = csv_file
             stdout_file = ''.join(
@@ -87,14 +101,16 @@ def __init__(
                 if args.output_dir is None:
                     diag_file = create_named_text_file(
                         dir=_TMPDIR,
-                        prefix='{}-diagnostic-{}-'.format(file_basename, i + 1),
+                        prefix='{}-diagnostic-{}-'.format(
+                            file_basename, str(chain_ids[i])
+                        ),
                         suffix='.csv',
                     )
                 else:
                     diag_file = os.path.join(
                         output_dir,
                         '{}-diagnostic-{}.{}'.format(
-                            file_basename, i + 1, 'csv'
+                            file_basename, str(chain_ids[i]), 'csv'
                         ),
                     )
                 self._diagnostic_files[i] = diag_file
@@ -126,9 +142,14 @@ def method(self) -> Method:
 
     @property
     def chains(self) -> int:
-        """Number of sampler chains."""
+        """Number of chains."""
         return self._chains
 
+    @property
+    def chain_ids(self) -> List[int]:
+        """Chain ids."""
+        return self._chain_ids
+
     @property
     def cmds(self) -> List[str]:
         """Per-chain call to CmdStan."""
@@ -297,6 +318,11 @@ def chains(self) -> int:
         """Number of chains."""
         return self.runset.chains
 
+    @property
+    def chain_ids(self) -> List[int]:
+        """Chain ids."""
+        return self.runset.chain_ids
+
     @property
     def num_draws(self) -> int:
         """Number of post-warmup draws per chain."""
@@ -499,12 +525,34 @@ def _assemble_sample(self) -> None:
                     xs = line.split(',')
                     self._sample[i, chain, :] = [float(x) for x in xs]
 
-    def summary(self) -> pd.DataFrame:
+    def summary(self, percentiles: List[int] = None) -> pd.DataFrame:
         """
         Run cmdstan/bin/stansummary over all output csv files.
         Echo stansummary stdout/stderr to console.
         Assemble csv tempfile contents into pandasDataFrame.
+
+        :param percentiles: Non-empty list of percentiles to report, in
+            strictly increasing order. Each must be an integer in [1, 99].
         """
+        percentiles_str = '--percentiles=5,50,95'
+        if percentiles is not None:
+            if len(percentiles) == 0:
+                raise ValueError(
+                    'invalid percentiles argument, must be ordered'
+                    ' non-empty list from (1, 99), inclusive.'
+                )
+
+            cur_pct = 0
+            for pct in percentiles:
+                if pct > 99 or not pct > cur_pct:
+                    raise ValueError(
+                        'invalid percentiles spec, must be ordered'
+                        ' non-empty list from (1, 99), inclusive.'
+                    )
+                cur_pct = pct
+            percentiles_str = '='.join(
+                ['--percentiles', ','.join([str(x) for x in percentiles])]
+            )
         cmd_path = os.path.join(
             cmdstan_path(), 'bin', 'stansummary' + EXTENSION
         )
@@ -516,6 +564,7 @@ def summary(self) -> pd.DataFrame:
         )
         cmd = [
             cmd_path,
+            percentiles_str,
             '--csv_file={}'.format(tmp_csv_path),
         ] + self.runset.csv_files
         do_command(cmd, logger=self.runset._logger)
diff --git a/test/test_generate_quantities.py b/test/test_generate_quantities.py
@@ -60,6 +60,9 @@ def test_gen_quantities_csv_files_bad(self):
         model = CmdStanModel(stan_file=stan)
         jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
 
+        with self.assertRaises(ValueError):
+            model.generate_quantities(data=jdata, mcmc_sample=[])
+
         # synthesize list of filenames
         goodfiles_path = os.path.join(
             DATAFILES_PATH, 'runset-bad', 'bad-draws-bern'
diff --git a/test/test_runset.py b/test/test_runset.py
@@ -16,14 +16,15 @@ def test_check_retcodes(self):
         exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
         jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
         sampler_args = SamplerArgs()
+        chain_ids = [1, 2, 3, 4]  # default
         cmdstan_args = CmdStanArgs(
             model_name='bernoulli',
             model_exe=exe,
-            chain_ids=[1, 2, 3, 4],
+            chain_ids=chain_ids,
             data=jdata,
             method_args=sampler_args,
         )
-        runset = RunSet(args=cmdstan_args, chains=4)
+        runset = RunSet(args=cmdstan_args)
         self.assertIn('RunSet: chains=4', runset.__repr__())
         self.assertIn('method=sample', runset.__repr__())
 
@@ -44,14 +45,15 @@ def test_get_err_msgs(self):
         exe = os.path.join(DATAFILES_PATH, 'logistic' + EXTENSION)
         rdata = os.path.join(DATAFILES_PATH, 'logistic.data.R')
         sampler_args = SamplerArgs()
+        chain_ids = [1, 2, 3]
         cmdstan_args = CmdStanArgs(
             model_name='logistic',
             model_exe=exe,
-            chain_ids=[1, 2, 3],
+            chain_ids=chain_ids,
             data=rdata,
             method_args=sampler_args,
         )
-        runset = RunSet(args=cmdstan_args, chains=3)
+        runset = RunSet(args=cmdstan_args, chains=3, chain_ids=chain_ids)
         for i in range(3):
             runset._set_retcode(i, 70)
             stdout_file = 'chain-' + str(i + 1) + '-missing-data-stdout.txt'
@@ -64,14 +66,15 @@ def test_output_filenames(self):
         exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
         jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
         sampler_args = SamplerArgs()
+        chain_ids = [1, 2, 3, 4]
         cmdstan_args = CmdStanArgs(
             model_name='bernoulli',
             model_exe=exe,
-            chain_ids=[1, 2, 3, 4],
+            chain_ids=chain_ids,
             data=jdata,
             method_args=sampler_args,
         )
-        runset = RunSet(args=cmdstan_args, chains=4)
+        runset = RunSet(args=cmdstan_args)
         self.assertIn('bernoulli-', runset._csv_files[0])
         self.assertIn('-1-', runset._csv_files[0])
         self.assertIn('-4-', runset._csv_files[3])
@@ -80,17 +83,53 @@ def test_commands(self):
         exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
         jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
         sampler_args = SamplerArgs()
+        chain_ids = [1, 2, 3, 4]
         cmdstan_args = CmdStanArgs(
             model_name='bernoulli',
             model_exe=exe,
-            chain_ids=[1, 2, 3, 4],
+            chain_ids=chain_ids,
             data=jdata,
             method_args=sampler_args,
         )
-        runset = RunSet(args=cmdstan_args, chains=4)
+        runset = RunSet(args=cmdstan_args)
         self.assertIn('id=1', runset._cmds[0])
         self.assertIn('id=4', runset._cmds[3])
 
+    def test_chain_ids(self):
+        exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
+        jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
+        sampler_args = SamplerArgs()
+        chain_ids = [11, 12, 13, 14]
+        cmdstan_args = CmdStanArgs(
+            model_name='bernoulli',
+            model_exe=exe,
+            chain_ids=chain_ids,
+            data=jdata,
+            method_args=sampler_args,
+        )
+        runset = RunSet(args=cmdstan_args, chains=4, chain_ids=chain_ids)
+        self.assertIn('id=11', runset._cmds[0])
+        self.assertIn('-11-', runset._csv_files[0])
+        self.assertIn('id=14', runset._cmds[3])
+        self.assertIn('-14-', runset._csv_files[3])
+
+    def test_ctor_checks(self):
+        exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
+        jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
+        sampler_args = SamplerArgs()
+        chain_ids = [11, 12, 13, 14]
+        cmdstan_args = CmdStanArgs(
+            model_name='bernoulli',
+            model_exe=exe,
+            chain_ids=chain_ids,
+            data=jdata,
+            method_args=sampler_args,
+        )
+        with self.assertRaises(ValueError):
+            RunSet(args=cmdstan_args, chains=0)
+        with self.assertRaises(ValueError):
+            RunSet(args=cmdstan_args, chains=4, chain_ids=[1, 2, 3])
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_sample.py b/test/test_sample.py