Merge branch 'master' into diff_input_processing_recoding

jmarinllao · web-flow · commit d78e99eb4815 · 2020-05-11T09:05:52.000+02:00
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -57,8 +57,6 @@
 intersphinx_mapping = {
     'python': ('https://docs.python.org/3', None),
     'networkx': ('https://networkx.github.io/documentation/stable', None),
-    'sqlalchemy': ('https://docs.sqlalchemy.org/en/13/', None),
-    'pybel': ('https://pybel.readthedocs.io/en/latest/', None),
 }
 
 autodoc_member_order = 'bysource'
diff --git a/docs/source/intro.rst b/docs/source/intro.rst
@@ -14,6 +14,7 @@ You can submit your dataset in any of the following formats:
 Please ensure that the dataset minimally has a column 'Node' containing node IDs. You can also optionally add the
 following columns to your dataset:
 
+- NodeType
 - LogFC [*]_
 - p-value
 
@@ -42,39 +43,53 @@ details.
 |      D     |
 +------------+
 
-2. You can also choose to provide a dataset with a column 'Node' containing node IDs as well as a column 'logFC' with
-their abs(LogFC).
+2. You can also provide a dataset with a column 'Node' containing node IDs as well as a column 'NodeType' indicating
+the entity type of each node, so that diffusion can be run by entity type.
+
++------------+--------------+
+|     Node   |   NodeType   |
++============+==============+
+|      A     |     Gene     |
++------------+--------------+
+|      B     |     Gene     |
++------------+--------------+
+|      C     |  Metabolite  |
++------------+--------------+
+|      D     |    Gene      |
++------------+--------------+
+
+3. You can also choose to provide a dataset with a column 'Node' containing node IDs as well as a column 'LogFC' with
+their LogFC. You may also add a 'NodeType' column to run diffusion by entity type.
 
 +--------------+------------+
 | Node         |   LogFC    |
 +==============+============+
-| Gene A       | 4          |
+|      A       | 4          |
 +--------------+------------+
-| Gene  B      | -1         |
+|      B       | -1         |
 +--------------+------------+
-| Metabolite C | 1.5        |
+|      C       | 1.5        |
 +--------------+------------+
-| Gene D       | 3          |
+|      D       | 3          |
 +--------------+------------+
 
-3. Finally, you can provide a dataset with a column 'Node' containing node IDs, a column 'logFC' with their abs(LogFC)
-and a column 'p-value' with adjusted p-values.
+4. Finally, you can provide a dataset with a column 'Node' containing node IDs, a column 'LogFC' with their LogFC and a
+column 'p-value' with adjusted p-values. You may also add a 'NodeType' column to run diffusion by entity type.
 
 +--------------+------------+---------+
 | Node         |   LogFC    | p-value |
 +==============+============+=========+
-| Gene A       | 4          | 0.03    |
+|      A       | 4          | 0.03    |
 +--------------+------------+---------+
-| Gene  B      | -1         | 0.05    |
+|      B       | -1         | 0.05    |
 +--------------+------------+---------+
-| Metabolite C | 1.5        | 0.001   |
+|      C       | 1.5        | 0.001   |
 +--------------+------------+---------+
-| Gene D       | 3          | 0.07    |
+|      D       | 3          | 0.07    |
 +--------------+------------+---------+
 
-You can also take a look at our `sample datasets <https://github.com/multipaths/DiffuPy/tree/master/examples/datasets>`_
-folder for some examples files.
-
+See the `sample datasets <https://github.com/multipaths/DiffuPy/tree/master/examples/datasets>`_ directory for example
+files.
 
 Networks
 --------
@@ -119,13 +134,13 @@ Custom-network example
 ~~~~~~~~~~~~~~~~~~~~~~
 
 +-----------+--------------+-------------+
-|  Source   |   Target     | Relation    |
+|  Source   |   Target     |  Relation   |
 +===========+==============+=============+
-| Gene A    | Gene B       | Increase    |
+|     A     |      B       | Increase    |
 +-----------+--------------+-------------+
-| Gene B    | Metabolite C | Association |
+|     B     |      C       | Association |
 +-----------+--------------+-------------+
-| Gene A    | Pathology D  | Association |
+|     A     |      D       | Association |
 +-----------+--------------+-------------+
 
 You can also take a look at our `sample networks <https://github.com/multipaths/DiffuPy/tree/master/examples/networks>`_
diff --git a/examples/README.rst b/examples/README.rst
@@ -11,6 +11,7 @@ You can submit your dataset in any of the following formats:
 Please ensure that the dataset minimally has a column 'Node' containing node IDs. You can also optionally add the
 following columns to your dataset:
 
+- NodeType
 - LogFC [*]_
 - p-value
 
@@ -39,36 +40,49 @@ details.
 |      D     |
 +------------+
 
-2. You can also choose to provide a dataset with a column 'Node' containing node IDs as well as a column 'logFC' with
-their | logFC |.
+2. You can also provide a dataset with a column 'Node' containing node IDs as well as a column 'NodeType' indicating
+the entity type of each node, so that diffusion can be run by entity type.
+
++------------+--------------+
+|     Node   |   NodeType   |
++============+==============+
+|      A     |     Gene     |
++------------+--------------+
+|      B     |     Gene     |
++------------+--------------+
+|      C     |  Metabolite  |
++------------+--------------+
+|      D     |    Gene      |
++------------+--------------+
+
+3. You can also choose to provide a dataset with a column 'Node' containing node IDs as well as a column 'LogFC' with
+their LogFC. You may also add a 'NodeType' column to run diffusion by entity type.
 
 +--------------+------------+
 | Node         |   LogFC    |
 +==============+============+
-| Gene A       | 4          |
+|      A       | 4          |
 +--------------+------------+
-| Gene  B      | -1         |
+|      B       | -1         |
 +--------------+------------+
-| Metabolite C | 1.5        |
+|      C       | 1.5        |
 +--------------+------------+
-| Gene D       | 3          |
+|      D       | 3          |
 +--------------+------------+
 
-.. | logFC | replace:: Log\ :sub:`2`\ FC
-
-3. Finally, you can provide a dataset with a column 'Node' containing node IDs, a column 'logFC' with their | logFC | and
-a column 'p-value' with adjusted p-values.
+4. Finally, you can provide a dataset with a column 'Node' containing node IDs, a column 'LogFC' with their LogFC and a
+column 'p-value' with adjusted p-values. You may also add a 'NodeType' column to run diffusion by entity type.
 
 +--------------+------------+---------+
 | Node         |   LogFC    | p-value |
 +==============+============+=========+
-| Gene A       | 4          | 0.03    |
+|      A       | 4          | 0.03    |
 +--------------+------------+---------+
-| Gene  B      | -1         | 0.05    |
+|      B       | -1         | 0.05    |
 +--------------+------------+---------+
-| Metabolite C | 1.5        | 0.001   |
+|      C       | 1.5        | 0.001   |
 +--------------+------------+---------+
-| Gene D       | 3          | 0.07    |
+|      D       | 3          | 0.07    |
 +--------------+------------+---------+
 
 See the `sample datasets <https://github.com/multipaths/DiffuPy/tree/master/examples/datasets>`_ directory for example
@@ -118,11 +132,11 @@ Custom-network example
 +-----------+--------------+-------------+
 |  Source   |   Target     |  Relation   |
 +===========+==============+=============+
-| Gene A    | Gene B       | Increase    |
+|     A     |      B       | Increase    |
 +-----------+--------------+-------------+
-| Gene B    | Metabolite C | Association |
+|     B     |      C       | Association |
 +-----------+--------------+-------------+
-| Gene A    | Pathology D  | Association |
+|     A     |      D       | Association |
 +-----------+--------------+-------------+
 
 See the `sample networks <https://github.com/multipaths/DiffuPy/tree/master/examples/networks>`_ directory for some
diff --git a/examples/datasets/node_type.csv b/examples/datasets/node_type.csv
@@ -0,0 +1,8 @@
+Node,NodeType
+A,Gene
+B,Gene
+C,Metabolite
+D,Gene
+E,Metabolite
+F,Gene
+G,Pathology
diff --git a/examples/networks/sample_network.csv b/examples/networks/sample_network.csv
@@ -1,4 +1,5 @@
 Source,Target,Relation
-Gene A,Gene B,Increase
-Gene B,Metabolite C,Association
-Gene A,Pathology D,Association
+A,B,Increase
+B,C,Association
+A,D,Association
+
diff --git a/setup.cfg b/setup.cfg
@@ -7,7 +7,7 @@ version = 0.0.5-dev
 description = Compute diffusion scores over networks
 long_description = file: README.rst
 
-# URLs associated with PyBEL
+# URLs associated with DiffuPy
 url = https://github.com/multipaths/DiffuPy
 download_url = https://github.com/multipaths/DiffuPy
 project_urls =
diff --git a/src/diffupy/cli.py b/src/diffupy/cli.py
@@ -9,8 +9,8 @@
 import time
 
 import click
-from diffupath.constants import OUTPUT_DIR
-from diffupy.process_network import get_kernel_from_network_path
+
+from .process_network import get_kernel_from_network_path
 
 from .constants import OUTPUT, METHODS, EMOJI, RAW, CSV, JSON
 from .diffuse import diffuse as run_diffusion
@@ -93,7 +93,7 @@ def kernel(
     '-o', '--output',
     type=click.File('w'),
     help="Output file",
-    default=OUTPUT_DIR,
+    default=OUTPUT,
 )
 @click.option(
     '-m', '--method',
@@ -141,7 +141,7 @@ def kernel(
 def diffuse(
         input: str,
         network: str,
-        output: str = OUTPUT_DIR,
+        output: str = OUTPUT,
         method: str = RAW,
         binarize: bool = False,
         threshold: float = None,
diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
@@ -266,6 +266,32 @@ def _codify_input_data(
                                     threshold
                                     )
 
+    # Rename the node-labeling column to 'Label' so that later processing can rely on a single column name.
+    if LABEL not in df.columns:
+        for l in list(df.columns):
+            if l in NODE_LABELING:
+                df = df.rename(columns={l: LABEL})
+                break
+
+    # If a node type column is provided, codify the input per node type and store the results in a dictionary.
+    if NODE_TYPE in df.columns:
+
+        node_types = list(set(df[NODE_TYPE]))  # Get the node types list set.
+        codified_by_type_dict = {}
+
+        for node_type in node_types:
+            # Select the rows belonging to the current node type.
+            df_by_type = df.loc[df[NODE_TYPE] == node_type]
+
+            # Codify the nodes of the current node type.
+            codified_by_type_dict[node_type] = _codify_method_check(df_by_type,
+                                                                    method,
+                                                                    binning,
+                                                                    absolute_value,
+                                                                    p_value,
+                                                                    threshold
+                                                                    )
+        return codified_by_type_dict
 
 def _codify_method_check(
         df: pd.DataFrame,
@@ -587,6 +613,7 @@ def mapping_statistics(
 
             total_mapping.update(mapping)
 
+
         if subtotals:
             statistics_dict['total_mapping'] = total_mapping
             statistics_dict['total_input'] = total_input
@@ -755,7 +782,7 @@ def _map_label_dict(
             label_bck = _check_label_to_background_labels(label, background_labels, check_substrings)
             if label_bck is not None:
                 mapped_dict[label_bck] = v
-
+   
         elif isinstance(label, set) or isinstance(label, tuple) or isinstance(label, list):
             for sublabel in set(label):
                 label_bck = _check_label_to_background_labels(sublabel, background_labels, check_substrings)
diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py
@@ -24,6 +24,7 @@
 """Matrix/graph handling utils."""
 
 
+
 def get_laplacian(graph: Graph, normalized: bool = False) -> np.ndarray:
     """Return Laplacian matrix."""
     if nx.is_directed(graph):
@@ -137,18 +138,16 @@ def print_dict_dimensions(entities_db, title='Title', message=''):
     """Print dimension of the dictionary."""
     total = 0
     m = f'{title}\n'
-
     for k1, v1 in entities_db.items():
         m += f'\n{message}{k1}:\n'
         if isinstance(v1, dict):
             for k2, v2 in v1.items():
                 m += f'{k2}  ({v2})\n'
         else:
             m += f'{v1}'
-
+            
     print(f'{m}\n\n')
 
-
 def log_dict(dict_to_print: dict, message: str = ''):
     """Print dictionary as list with a message."""
     for k1, v1 in dict_to_print.items():
@@ -301,6 +300,12 @@ def munge_cell(cell):
     else:
         raise TypeError(f'The cell "{cell}" could not be processed.')
 
+def parse_xls_sheet_to_df(sheet: opxl.workbook,
+                          min_row: Optional[int] = 1,
+                          relevant_cols: Optional[list] = None,
+                          irrelevant_cols: Optional[list] = None) -> pd.DataFrame:
+    """Process/format excel sheets to DataFrame."""
+    parsed_sheet_dict = {}
 
 def parse_xls_sheet_to_df(sheet: opxl.workbook,
                           min_row: Optional[int] = 1,
diff --git a/tests/resources/datasets/node_type_col.csv b/tests/resources/datasets/node_type_col.csv
@@ -1,6 +1,8 @@
-NodeType,Node
-Gene,A
-Gene,B
-Metabolite,C
-Gene,D
-Gene,E
+Node,NodeType
+A,Gene
+B,Gene
+C,Metabolite
+D,Gene
+E,Metabolite
+F,Gene
+G,Pathology
diff --git a/tests/test_input.py b/tests/test_input.py
@@ -181,8 +181,6 @@ def test_map_labels_input_type_dict_label_scores_dict_background_two_dimensional
                                    background_labels={'db1': {'Gene': ['A', 'B']}, 'db2': {'Metabolite': ['C']}},
                                    show_descriptive_stat=True)
 
-        print(mapping)
-
     def test_network(self):
         """Test generate graph from csv."""
         graph = get_graph_from_df(NETWORK_PATH, CSV)
@@ -273,7 +271,7 @@ def test_format_input_for_diffusion_label_list(self):
             map_labels_input({'Metabolite': {'C': -1}, 'Gene': {'A': 2, 'B': 1}, 'mirnas': {'A': 1, 'T': 1}},
                              self.kernel_test_1.rows_labels),
             self.kernel_test_1,
-        )
+        
 
         # TODO: Implement in Matrix equal, now if the col order is mixed it raises error
         # assert(np.allclose(processed_mapped_nodes_list.mat,

Original file line number	Diff line number	Diff line change
`@@ -57,8 +57,6 @@`
`57`	`57`	`intersphinx_mapping = {`
`58`	`58`	`'python': ('https://docs.python.org/3', None),`
`59`	`59`	`'networkx': ('https://networkx.github.io/documentation/stable', None),`
`60`		`- 'sqlalchemy': ('https://docs.sqlalchemy.org/en/13/', None),`
`61`		`- 'pybel': ('https://pybel.readthedocs.io/en/latest/', None),`
`62`	`60`	`}`
`63`	`61`
`64`	`62`	`autodoc_member_order = 'bysource'`