Skip to content

Commit 9a34bd4

Browse files
authored
Add token validation check to math programming (#2748)
* Add error diagnostics to optimize model generation * Fix unit test bug * Polish * Address review comments
1 parent f4b4c44 commit 9a34bd4

2 files changed

Lines changed: 138 additions & 3 deletions

File tree

python/runtime/optimize/model_generation.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,56 @@
1212
# limitations under the License.
1313

1414
import copy
15+
import re
1516

1617
__all__ = [
1718
'generate_unique_result_value_name',
1819
'generate_objective_and_constraint_expression',
1920
]
2021

22+
# Raw string: "\w" in a plain string literal is an invalid escape sequence
# (SyntaxWarning on modern Python).
IDENTIFIER_REGEX = re.compile(r"[_a-zA-Z]\w*")


def assert_are_valid_tokens(columns, tokens, result_value_name, group_by=None):
    """
    Check whether the tokens are valid. A token is valid when it matches
    one of the columns or result_value_name (case-insensitive), when it
    is not an identifier at all (operators, literals, parentheses), or
    when it is an identifier immediately followed by "(" (a function
    call). Otherwise, raise AssertionError.

    Args:
        columns (list[str]): the column names of the source table.
        tokens (list[str]): the token list.
        result_value_name (str): the result value name to be optimized.
        group_by (str): the column name to be grouped.

    Returns:
        None

    Raises:
        AssertionError: if tokens is empty, if group_by is not one of
            columns, or if any token is invalid.
    """
    # Set for O(1) membership tests; all comparisons are case-insensitive.
    valid_columns = {c.lower() for c in columns}

    # GROUP BY must name a real source column, so it is checked before
    # result_value_name is added to the valid set.
    if group_by:
        assert group_by.lower() in valid_columns, \
            "GROUP BY column %s not found" % group_by

    assert tokens, "tokens should not be empty"

    valid_columns.add(result_value_name.lower())

    for i, token in enumerate(tokens):
        if token.lower() in valid_columns:
            continue

        # Non-identifier tokens (operators, numbers, parentheses) cannot
        # be misspelled column names, so they are accepted as-is.
        if IDENTIFIER_REGEX.fullmatch(token) is None:
            continue

        # An identifier that is not a known column must be a function
        # call, i.e. followed (ignoring blank tokens) by "(".
        assert find_next_non_blank_token(tokens, i + 1) == "(", \
            "invalid token %s" % token
64+
2165

2266
def generate_unique_result_value_name(columns, result_value_name, variables):
2367
"""
@@ -184,6 +228,30 @@ def generate_group_by_range_and_index_str(group_by, data_str, value_str,
184228
return outer_range_str, inner_range_str, [value_str, index_str]
185229

186230

231+
def find_next_non_blank_token(tokens, i):
    """
    Find the next non-blank token at or after index i.

    Args:
        tokens (list[str]): a string token list.
        i (int): the position to start searching from.

    Returns:
        The first token at index >= i that is not blank, or None when
        i is negative or no such token exists.
    """
    if i < 0:
        return None

    # Scan the tail of the list lazily; default to None when every
    # remaining token is blank (or the slice is empty).
    return next((token for token in tokens[i:] if token.strip()), None)
253+
254+
187255
def find_prev_non_blank_token(tokens, i):
188256
"""
189257
Find previous non-blank token before index i (including i).
@@ -585,6 +653,9 @@ def generate_objective_and_constraint_expression(columns,
585653
constraint_exprs = []
586654

587655
if objective:
656+
assert_are_valid_tokens(columns=columns,
657+
tokens=objective,
658+
result_value_name=result_value_name)
588659
obj_expr, for_range, iter_vars = generate_objective_or_constraint_expression(
589660
columns=columns,
590661
tokens=objective,
@@ -603,6 +674,10 @@ def generate_objective_and_constraint_expression(columns,
603674
tokens = c.get("tokens")
604675
group_by = c.get("group_by")
605676

677+
assert_are_valid_tokens(columns=columns,
678+
tokens=tokens,
679+
result_value_name=result_value_name,
680+
group_by=group_by)
606681
expr, for_range, iter_vars = generate_objective_or_constraint_expression(
607682
columns=columns,
608683
tokens=tokens,

python/runtime/optimize/model_generation_test.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,68 @@
1818
import pandas as pd
1919
import pyomo.environ as pyomo_env
2020
from runtime.optimize.local import generate_model_with_data_frame, solve_model
21-
from runtime.optimize.model_generation import \
22-
generate_objective_and_constraint_expression
21+
from runtime.optimize.model_generation import (
22+
IDENTIFIER_REGEX, assert_are_valid_tokens,
23+
generate_objective_and_constraint_expression)
24+
25+
26+
class TestAssertValidTokens(unittest.TestCase):
    """Tests for IDENTIFIER_REGEX and assert_are_valid_tokens."""

    def is_identifier(self, token):
        """Return True when token fully matches IDENTIFIER_REGEX."""
        return IDENTIFIER_REGEX.fullmatch(token) is not None

    def test_is_identifier(self):
        for token in ('a', '_', 'a123', '__', '_123'):
            self.assertTrue(self.is_identifier(token))

        for token in ('1', '123_', '3def'):
            self.assertFalse(self.is_identifier(token))

    def test_assert_valid_tokens(self):
        columns = ['finishing', 'product']
        ok_tokens = [
            'SUM', '(', 'finishing', '*', 'product', ')', '<=', '100'
        ]

        # A well-formed expression passes without raising.
        assert_are_valid_tokens(columns=columns,
                                tokens=ok_tokens,
                                result_value_name='product')

        # An unknown GROUP BY column is rejected.
        with self.assertRaises(AssertionError):
            assert_are_valid_tokens(columns=columns,
                                    tokens=ok_tokens,
                                    result_value_name='product',
                                    group_by='invalid_group_by')

        # Both None and an empty list are rejected as token lists.
        for bad_tokens in (None, []):
            with self.assertRaises(AssertionError):
                assert_are_valid_tokens(columns=columns,
                                        tokens=bad_tokens,
                                        result_value_name='product')

        # An identifier that is neither a column nor a call is rejected.
        bad_tokens = [
            'SUM', '(', 'finishing', '*', 'invalid_token', ')', '<=', '100'
        ]
        with self.assertRaises(AssertionError):
            assert_are_valid_tokens(columns=columns,
                                    tokens=bad_tokens,
                                    result_value_name='product')

        # Column / result-value-name matching ignores case.
        mixed_case_tokens = [
            'SUM', '(', 'FinisHing', '*', 'pRoducT_VaLue', ')', '<=', '100'
        ]
        assert_are_valid_tokens(columns=columns,
                                tokens=mixed_case_tokens,
                                result_value_name='product_value')
2383

2484

2585
class TestModelGenerationBase(unittest.TestCase):
@@ -72,7 +132,7 @@ def replace_objective_token(self, objective, old, new):
72132

73133
def replace_constraint_token(self, constraint, old, new):
74134
def replace_one_constraint(c):
75-
c = copy.copy(c)
135+
c = copy.deepcopy(c)
76136
for i, token in enumerate(c["tokens"]):
77137
if token == old:
78138
c["tokens"][i] = new

0 commit comments

Comments
 (0)