Merge branch 'develop' of https://github.com/stan-dev/cmdstanpy into develop

mitzimorris · mitzimorris · commit cde239299d6b · 2020-08-09T13:22:01.000-04:00
diff --git a/cmdstanpy/_version.py b/cmdstanpy/_version.py
@@ -1,3 +1,3 @@
 """PyPi Version"""
 
-__version__ = '0.9.62'
+__version__ = '0.9.63'
diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py
@@ -436,6 +436,7 @@ def sample(
         output_dir: str = None,
         save_diagnostics: bool = False,
         show_progress: Union[bool, str] = False,
+        validate_csv: bool = True,
     ) -> CmdStanMCMC:
         """
         Run one or more chains of the NUTS sampler to produce a set of draws
@@ -580,6 +581,11 @@ def sample(
             If show_progress=='notebook' use tqdm_notebook
             (needs nodejs for jupyter).
 
+        :param validate_csv: If ``False``, skip the scan of the sample csv
+            output files, which speeds up processing when the sample is large
+            or disk i/o is slow.  Default is ``True`` - sample csv files are
+            scanned for completeness and consistency.
+
         :return: CmdStanMCMC object
         """
         if chains is None:
@@ -620,7 +626,7 @@ def sample(
         if parallel_chains is None:
             parallel_chains = max(min(cpu_count(), chains), 1)
         elif parallel_chains > chains:
-            self._logger.warning(
+            self._logger.info(
                 'Requesting %u parallel_chains for %u chains,'
                 ' running all chains in parallel.',
                 parallel_chains,
@@ -756,7 +762,7 @@ def sample(
                     err_msg = '{}{}'.format(err_msg, ''.join(console_errs))
                 raise RuntimeError(err_msg)
 
-            mcmc = CmdStanMCMC(runset)
+            mcmc = CmdStanMCMC(runset, validate_csv, logger=self._logger)
         return mcmc
 
     def generate_quantities(
diff --git a/cmdstanpy/stanfit.py b/cmdstanpy/stanfit.py
@@ -5,6 +5,7 @@
 import shutil
 import copy
 import logging
+import math
 from typing import List, Tuple, Dict
 from collections import Counter, OrderedDict
 from datetime import datetime
@@ -272,14 +273,21 @@ class CmdStanMCMC:
     Container for outputs from CmdStan sampler run.
     """
 
-    def __init__(self, runset: RunSet) -> None:
+    # pylint: disable=too-many-instance-attributes
+    def __init__(
+        self,
+        runset: RunSet,
+        validate_csv: bool = True,
+        logger: logging.Logger = None,
+    ) -> None:
         """Initialize object."""
         if not runset.method == Method.SAMPLE:
             raise ValueError(
                 'Wrong runset method, expecting sample runset, '
                 'found method {}'.format(runset.method)
             )
         self.runset = runset
+        self._logger = logger or get_logger()
         # copy info from runset
         self._is_fixed_param = runset._args.method_args.fixed_param
         self._iter_sampling = runset._args.method_args.iter_sampling
@@ -298,7 +306,9 @@ def __init__(self, runset: RunSet) -> None:
         self._warmup = None
         self._drawset = None
         self._stan_variable_dims = {}
-        self._validate_csv_files()
+        self._validate_csv = validate_csv
+        if validate_csv:
+            self.validate_csv_files()
 
     def __repr__(self) -> str:
         repr = 'CmdStanMCMC: model={} chains={}{}'.format(
@@ -326,11 +336,15 @@ def chain_ids(self) -> List[int]:
     @property
     def num_draws(self) -> int:
         """Number of post-warmup draws per chain."""
+        if not self._validate_csv and self._draws_sampling is None:
+            return int(math.ceil(self._iter_sampling / self._thin))
         return self._draws_sampling
 
     @property
     def num_draws_warmup(self) -> int:
         """Number of warmup draws per chain."""
+        if not self._validate_csv and self._draws_warmup is None:
+            return int(math.ceil(self._iter_warmup / self._thin))
         return self._draws_warmup
 
     @property
@@ -339,6 +353,12 @@ def column_names(self) -> Tuple[str, ...]:
         Names of all per-draw outputs: all
         sampler and model parameters and quantities of interest
         """
+        if not self._validate_csv and len(self._column_names) == 0:
+            self._logger.warning(
+                'csv files not yet validated, run method validate_csv_files()'
+                ' in order to retrieve sample metadata.'
+            )
+            return None
         return self._column_names
 
     @property
@@ -348,6 +368,12 @@ def stan_variable_dims(self) -> Dict:
         Scalar types have int value '1'.  Structured types have list of dims,
         e.g.,  program variable ``vector[10] foo`` has entry ``('foo', [10])``.
         """
+        if not self._validate_csv and len(self._stan_variable_dims) == 0:
+            self._logger.warning(
+                'csv files not yet validated, run method validate_csv_files()'
+                ' in order to retrieve sample metadata.'
+            )
+            return None
         return copy.deepcopy(self._stan_variable_dims)
 
     @property
@@ -356,6 +382,14 @@ def metric_type(self) -> str:
         Metric type used for adaptation, either 'diag_e' or 'dense_e'.
         When sampler algorithm 'fixed_param' is specified, metric_type is None.
         """
+        if self._is_fixed_param:
+            return None
+        if not self._validate_csv and self._metric_type is None:
+            self._logger.warning(
+                'csv files not yet validated, run method validate_csv_files()'
+                ' in order to retrieve sample metadata.'
+            )
+            return None
         return self._metric_type
 
     @property
@@ -364,7 +398,15 @@ def metric(self) -> np.ndarray:
         Metric used by sampler for each chain.
         When sampler algorithm 'fixed_param' is specified, metric is None.
         """
-        if not self._is_fixed_param and self._metric is None:
+        if self._is_fixed_param:
+            return None
+        if not self._validate_csv and self._metric is None:
+            self._logger.warning(
+                'csv files not yet validated, run method validate_csv_files()'
+                ' in order to retrieve sample metadata.'
+            )
+            return None
+        if self._sample is None:
             self._assemble_sample()
         return self._metric
 
@@ -374,7 +416,15 @@ def stepsize(self) -> np.ndarray:
         Stepsize used by sampler for each chain.
         When sampler algorithm 'fixed_param' is specified, stepsize is None.
         """
-        if not self._is_fixed_param and self._stepsize is None:
+        if self._is_fixed_param:
+            return None
+        if not self._validate_csv and self._stepsize is None:
+            self._logger.warning(
+                'csv files not yet validated, run method validate_csv_files()'
+                ' in order to retrieve sample metadata.'
+            )
+            return None
+        if self._sample is None:
             self._assemble_sample()
         return self._stepsize
 
@@ -386,6 +436,8 @@ def sample(self) -> np.ndarray:
         so that the values for each parameter are stored contiguously
         in memory, likewise all draws from a chain are contiguous.
         """
+        if not self._validate_csv and self._sample is None:
+            self.validate_csv_files()
         if self._sample is None:
             self._assemble_sample()
         return self._sample
@@ -400,11 +452,13 @@ def warmup(self) -> np.ndarray:
         """
         if not self._save_warmup:
             return None
+        if not self._validate_csv and self._sample is None:
+            self.validate_csv_files()
         if self._sample is None:
             self._assemble_sample()
         return self._warmup
 
-    def _validate_csv_files(self) -> None:
+    def validate_csv_files(self) -> None:
         """
         Checks that csv output files for all chains are consistent.
         Populates attributes for draws, column_names, num_params, metric_type.
diff --git a/test/test_sample.py b/test/test_sample.py
@@ -258,7 +258,7 @@ def test_multi_proc(self):
         log.check_present(
             (
                 'cmdstanpy',
-                'WARNING',
+                'INFO',
                 'Requesting 7 parallel_chains for 1 chains, '
                 'running all chains in parallel.',
             )
@@ -292,6 +292,9 @@ def test_fixed_param_good(self):
             data=no_data, seed=12345, iter_sampling=100, fixed_param=True
         )
         self.assertEqual(datagen_fit.runset._args.method, Method.SAMPLE)
+        self.assertEqual(datagen_fit.metric_type, None)
+        self.assertEqual(datagen_fit.metric, None)
+        self.assertEqual(datagen_fit.stepsize, None)
 
         for i in range(datagen_fit.runset.chains):
             csv_file = datagen_fit.runset.csv_files[i]
@@ -851,6 +854,66 @@ def test_variables(self):
         self.assertTrue('theta' in vars)
         self.assertEqual(vars['theta'].shape, (20, 4))
 
+    def test_validate(self):
+        stan = os.path.join(DATAFILES_PATH, 'bernoulli.stan')
+        jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
+        bern_model = CmdStanModel(stan_file=stan)
+        bern_fit = bern_model.sample(
+            data=jdata,
+            chains=2,
+            seed=12345,
+            iter_warmup=200,
+            iter_sampling=100,
+            thin=2,
+            save_warmup=True,
+            validate_csv=False,
+        )
+        # check error messages
+        with LogCapture() as log:
+            logging.getLogger()
+            self.assertIsNone(bern_fit.column_names)
+        expect = 'csv files not yet validated'
+        msg = log.actual()[-1][-1]
+        self.assertTrue(msg.startswith(expect))
+
+        with LogCapture() as log:
+            logging.getLogger()
+            self.assertIsNone(bern_fit.stan_variable_dims)
+        expect = 'csv files not yet validated'
+        msg = log.actual()[-1][-1]
+        self.assertTrue(msg.startswith(expect))
+
+        with LogCapture() as log:
+            logging.getLogger()
+            self.assertIsNone(bern_fit.metric_type)
+        expect = 'csv files not yet validated'
+        msg = log.actual()[-1][-1]
+        self.assertTrue(msg.startswith(expect))
+
+        with LogCapture() as log:
+            logging.getLogger()
+            self.assertIsNone(bern_fit.metric)
+        expect = 'csv files not yet validated'
+        msg = log.actual()[-1][-1]
+        self.assertTrue(msg.startswith(expect))
+
+        with LogCapture() as log:
+            logging.getLogger()
+            self.assertIsNone(bern_fit.stepsize)
+        expect = 'csv files not yet validated'
+        msg = log.actual()[-1][-1]
+        self.assertTrue(msg.startswith(expect))
+
+        # check computations match
+        self.assertEqual(bern_fit.num_draws, 50)
+        self.assertEqual(bern_fit.num_draws_warmup, 100)
+        bern_fit.validate_csv_files()
+        self.assertEqual(bern_fit.num_draws, 50)
+        self.assertEqual(bern_fit.num_draws_warmup, 100)
+        self.assertEqual(len(bern_fit.column_names), 8)
+        self.assertEqual(len(bern_fit.stan_variable_dims), 1)
+        self.assertEqual(bern_fit.metric_type, 'diag_e')
+
 
 if __name__ == '__main__':
     unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,3 @@`
`1`	`1`	`"""PyPi Version"""`
`2`	`2`
`3`		`-__version__ = '0.9.62'`
	`3`	`+__version__ = '0.9.63'`