Skip to content

Commit 5c412b4

Browse files
authored
refine feature and model python code by flake8 (#2761)
1 parent b539fdd commit 5c412b4

8 files changed

Lines changed: 60 additions & 47 deletions

File tree

python/runtime/feature/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
from runtime.feature.derivation import infer_feature_columns
14+
from runtime.feature.derivation import infer_feature_columns # noqa: F401

python/runtime/feature/column.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -179,12 +179,13 @@ class CrossColumn(CategoryColumn):
179179
CrossColumn represents a crossed feature column.
180180
181181
Args:
182-
keys (str|NumericColumn): the underlying feature column name or NumericColumn object.
182+
keys (str|NumericColumn): the underlying feature column name or
183+
NumericColumn object.
183184
hash_bucket_size (int): the bucket size for hashing.
184185
"""
185186
def __init__(self, keys, hash_bucket_size):
186187
for k in keys:
187-
assert isinstance(k, six.string_types) or isinstance(k, NumericColumn), \
188+
assert isinstance(k, (six.string_types, NumericColumn)), \
188189
"keys of CROSS must be of either string or numeric type"
189190

190191
self.keys = keys
@@ -217,12 +218,12 @@ class EmbeddingColumn(FeatureColumn):
217218
Args:
218219
category_column (CategoryColumn): the underlying CategoryColumn object.
219220
dimension (int): the dimension of the embedding.
220-
combiner (str): how to reduce if there are multiple entries in a single row.
221-
Currently 'mean', 'sqrtn' and 'sum' are supported.
221+
combiner (str): how to reduce if there are multiple entries in a single
222+
row. Currently 'mean', 'sqrtn' and 'sum' are supported.
222223
initializer (str): the initializer of the embedding table.
223224
name (str): only used when category_column=None. In this case, the
224-
category_column would be filled automaticaly in the feature derivation
225-
stage.
225+
category_column would be filled automatically in the feature
226+
derivation stage.
226227
"""
227228
def __init__(self,
228229
category_column=None,
@@ -266,8 +267,8 @@ class IndicatorColumn(FeatureColumn):
266267
Args:
267268
category_column (CategoryColumn): the underlying CategoryColumn object.
268269
name (str): only used when category_column=None. In this case, the
269-
category_column would be filled automaticaly in the feature derivation
270-
stage.
270+
category_column would be filled automatically in the feature
271+
derivation stage.
271272
"""
272273
def __init__(self, category_column=None, name=""):
273274
if category_column is not None:

python/runtime/feature/derivation.py

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@
2020
import numpy as np
2121
import six
2222
from runtime.feature.column import (CategoryIDColumn, EmbeddingColumn,
23-
FeatureColumn, IndicatorColumn,
24-
NumericColumn)
23+
IndicatorColumn, NumericColumn)
2524
from runtime.feature.field_desc import DataFormat, DataType, FieldDesc
2625
from runtime.verifier import fetch_samples
2726

@@ -38,8 +37,8 @@ def init_column_map(target_fc_map, fc):
3837
Returns:
3938
None.
4039
"""
41-
if isinstance(fc, (EmbeddingColumn, IndicatorColumn)) and \
42-
len(fc.get_field_desc()) == 0:
40+
if isinstance(fc, (EmbeddingColumn, IndicatorColumn)) \
41+
and len(fc.get_field_desc()) == 0:
4342
if fc.name not in target_fc_map:
4443
target_fc_map[fc.name] = []
4544

@@ -129,7 +128,7 @@ def new_default_field_desc(name):
129128
BLANK_PATTERN = re.compile("\\s+")
130129

131130
# The Python 2/3 int64 type
132-
INT64_TYPE = long if six.PY2 else int
131+
INT64_TYPE = long if six.PY2 else int # noqa: F821
133132

134133

135134
def infer_string_data_format(str_data):
@@ -165,7 +164,8 @@ def fill_csv_field_desc(cell, field_desc):
165164
"""
166165
values = cell.split(",")
167166
if field_desc.is_sparse:
168-
assert field_desc.shape is not None, "the shape of CSV format data must be given"
167+
assert field_desc.shape is not None, \
168+
"the shape of CSV format data must be given"
169169
else:
170170
if field_desc.shape is None:
171171
field_desc.shape = [len(values)]
@@ -174,8 +174,8 @@ def fill_csv_field_desc(cell, field_desc):
174174
if np.prod(field_desc.shape) != len(values):
175175
if size > 1:
176176
raise ValueError(
177-
"column %s should be csv format dense tensor of %d element(s), but got %d element(s)"
178-
%
177+
"column %s should be csv format dense tensor "
178+
"of %d element(s), but got %d element(s)" %
179179
(field_desc.name, np.prod(field_desc.shape), len(values)))
180180

181181
field_desc.shape = [len(values)]
@@ -356,10 +356,13 @@ def update_feature_column(fc, fd_map):
356356
raise ValueError("column not found or inferred: %s" % fc.name)
357357

358358
# FIXME(typhoonzero): when to use sequence_category_id_column?
359-
# if column fieldDesc is SPARSE, the sparse shape should be in cs.Shape[0]
359+
# if column fieldDesc is SPARSE, the sparse shape should
360+
# be in cs.Shape[0]
360361
bucket_size = field_desc.shape[0]
361362
if not field_desc.is_sparse:
362-
assert field_desc.max_id > 0, "use dense column on embedding column but did not got a correct MaxID"
363+
assert field_desc.max_id > 0, \
364+
"use dense column on embedding column " \
365+
"but did not got a correct MaxID"
363366
bucket_size = field_desc.max_id + 1
364367

365368
fc.category_column = CategoryIDColumn(field_desc, bucket_size)
@@ -370,8 +373,10 @@ def update_feature_column(fc, fd_map):
370373
if field_desc is None:
371374
raise ValueError("column not found or inferred: %s" % fc.name)
372375

373-
assert field_desc.is_sparse, "cannot use sparse column with indicator column"
374-
assert field_desc.max_id > 0, "use indicator column but did not got a correct MaxID"
376+
assert field_desc.is_sparse, \
377+
"cannot use sparse column with indicator column"
378+
assert field_desc.max_id > 0, \
379+
"use indicator column but did not got a correct MaxID"
375380
bucket_size = field_desc.max_id + 1
376381
fc.category_column = CategoryIDColumn(field_desc, bucket_size)
377382

@@ -392,7 +397,8 @@ def new_feature_column(field_desc):
392397
else:
393398
category_column = CategoryIDColumn(field_desc,
394399
len(field_desc.vocabulary))
395-
# NOTE(typhoonzero): a default embedding size of 128 is enough for most cases.
400+
# NOTE(typhoonzero): a default embedding size of 128 is enough
401+
# for most cases.
396402
embedding = EmbeddingColumn(category_column=category_column,
397403
dimension=128,
398404
combiner="sum")
@@ -406,7 +412,8 @@ def derive_feature_columns(targets, fc_map, fd_map, selected_field_names,
406412
Derive the FeatureColumn.
407413
408414
Args:
409-
targets (list[str]): the feature column targets, e.g. "feature_columns".
415+
targets (list[str]): the feature column targets,
416+
e.g. "feature_columns".
410417
fc_map (dict[str -> dict[str -> list[FeatureColumn]]]): a FeatureColumn
411418
map, where the key of the outer dict is the target name, e.g.
412419
"feature_columns", and the key of the inner dict is the field name.
@@ -439,7 +446,8 @@ def derive_feature_columns(targets, fc_map, fd_map, selected_field_names,
439446
match_field_name = None
440447
for selected_field_name in selected_field_names:
441448
if field_pattern.fullmatch(selected_field_name):
442-
assert match_field_name is None, "%s matches duplicate fields" % field_name
449+
assert match_field_name is None, \
450+
"%s matches duplicate fields" % field_name
443451
match_field_name = selected_field_name
444452

445453
if match_field_name is None:
@@ -464,8 +472,8 @@ def derive_feature_columns(targets, fc_map, fd_map, selected_field_names,
464472
update_feature_column(fc, fd_map)
465473
else:
466474
if len(fc_map) > 1:
467-
# if column clause have more than one target, each target should specify the
468-
# full list of the columns to use.
475+
# if column clause has more than one target, each target
476+
# should specify the full list of the columns to use.
469477
continue
470478

471479
field_desc = fd_map[selected_field_name]
@@ -479,13 +487,13 @@ def derive_feature_columns(targets, fc_map, fd_map, selected_field_names,
479487
fc_target_map.update(new_fc_target_map)
480488

481489

482-
def update_ir_feature_column_map_by_derived_feature_column_map(
483-
features, fc_map, selected_field_names, label_name):
490+
def update_ir_feature_columns(features, fc_map, selected_field_names,
491+
label_name):
484492
"""
485493
Update the IR FeatureColumn map `features` by the derived FeatureColumn map
486-
`fc_map` . If any FeatureColumn inside `fc_map` does not exist in `features`,
487-
it would be added to `features` . Notice that `features` is not updated
488-
in-place, and we would return a new updated IR FeatureColumn map in
494+
`fc_map` . If any FeatureColumn inside `fc_map` does not exist in
495+
`features`, it would be added to `features` . Notice that `features` is not
496+
updated in-place, and we would return a new updated IR FeatureColumn map in
489497
this method.
490498
491499
Args:
@@ -542,9 +550,8 @@ def update_ir_feature_column_map_by_derived_feature_column_map(
542550
break
543551

544552
if not found:
545-
raise ValueError(
546-
"some feature column is missing in the derivation stage"
547-
)
553+
raise ValueError("some feature column is missing in the "
554+
"derivation stage")
548555

549556
sorted_pos = sorted(range(len(indices)), key=lambda k: indices[k])
550557
multi_fd_fcs = [multi_fd_fcs[i] for i in sorted_pos]
@@ -572,9 +579,11 @@ def derive_label(label, fd_map):
572579
return # NOTE: clustering model may not specify Label
573580

574581
label_field_desc = fd_map[label_name]
575-
assert label_field_desc is not None, "deriveLabel: LABEL COLUMN '%s' not found" % label_name
582+
assert label_field_desc is not None, \
583+
"deriveLabel: LABEL COLUMN '%s' not found" % label_name
576584

577-
# use shape [] if label shape is [1] for Tensorflow scalar label shape should be [].
585+
# use shape [] if label shape is [1] for Tensorflow scalar label
586+
# shape should be [].
578587
shape = label_field_desc.shape
579588
if shape is None or (len(shape) == 1 and shape[0] == 1):
580589
label_field_desc.shape = []
@@ -626,7 +635,7 @@ def infer_feature_columns(conn, select, features, label, n=1000):
626635

627636
derive_feature_columns(targets, fc_map, fd_map, selected_field_names,
628637
label_name)
629-
features = update_ir_feature_column_map_by_derived_feature_column_map(
630-
features, fc_map, selected_field_names, label_name)
638+
features = update_ir_feature_columns(features, fc_map,
639+
selected_field_names, label_name)
631640
label = derive_label(label, fd_map)
632641
return features, label

python/runtime/feature/derivation_test.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@
1616
import runtime.feature.derivation as fd
1717
import runtime.testing as testing
1818
from runtime.feature.column import (CategoryIDColumn, CrossColumn,
19-
EmbeddingColumn, IndicatorColumn,
20-
NumericColumn)
19+
EmbeddingColumn, NumericColumn)
2120
from runtime.feature.field_desc import DataFormat, DataType, FieldDesc
2221

2322

@@ -103,7 +102,8 @@ def test_without_cross(self):
103102
label = NumericColumn(
104103
FieldDesc(name="class", dtype=DataType.INT, shape=[1]))
105104

106-
select = "select c1, c2, c3, c4, c5, c6, class from feature_derivation_case.train"
105+
select = "select c1, c2, c3, c4, c5, c6, class " \
106+
"from feature_derivation_case.train"
107107
conn = testing.get_singleton_db_connection()
108108
features, label = fd.infer_feature_columns(conn, select, features,
109109
label)
@@ -218,7 +218,8 @@ def test_with_cross(self):
218218

219219
label = NumericColumn(
220220
FieldDesc(name='class', dtype=DataType.INT, shape=[1]))
221-
select = "select c1, c2, c3, c4, c5, class from feature_derivation_case.train"
221+
select = "select c1, c2, c3, c4, c5, class " \
222+
"from feature_derivation_case.train"
222223

223224
conn = testing.get_singleton_db_connection()
224225
features, label = fd.infer_feature_columns(conn, select, features,

python/runtime/feature/field_desc.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,10 @@ class FieldDesc(object):
5252
PLAIN, CSV, KV. Default PLAIN.
5353
shape (list[int]): the shape of the field data. Default None.
5454
is_sparse (bool): whether the field data is sparse. Default False.
55-
vocabulary (list[str]): the vocabulary used for categorical feature column. Default None.
56-
max_id (int): the maximum id number of the field data. Used in CategoryIDColumn. Default 0.
55+
vocabulary (list[str]): the vocabulary used for categorical
56+
feature column. Default None.
57+
max_id (int): the maximum id number of the field data. Used in
58+
CategoryIDColumn. Default 0.
5759
"""
5860
def __init__(self,
5961
name="",

python/runtime/model/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
1313

14-
from runtime.model.model import EstimatorType, Model, load
14+
from runtime.model.model import EstimatorType, Model, load # noqa: F401

python/runtime/model/model.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ class EstimatorType(Enum):
3636
# To stay compatible with old models, we start at 0
3737
TENSORFLOW = 0
3838
XGBOOST = 1
39-
# PAIML is the model type that trained by PAI machine learning algorithm toolkit
39+
# PAIML is the model type that is trained by the PAI machine learning algorithm
40+
# toolkit
4041
PAIML = 2
4142

4243

python/runtime/model/tar_test.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
# limitations under the License.
1313

1414
import os
15-
import shutil
1615
import tempfile
1716
import unittest
1817

0 commit comments

Comments
 (0)