Skip to content

Commit 6411a91

Browse files
authored
Diff input processing recoding (#19)
* Process network major recoding * Process data input major recoding and tests * Diffupy Process data input utils * Format inputs refactor and tested * Mapping subsets labels, implemented as _map_label_dict and _map_label_list * Parse xls added to diffuPy utils and as a process input option * General refactors and documentation in process_input * General refactor in imports and function naming updates in diffuPy package * Added feature to rename dataframe column titles according to label_mapping (if provided) * Excel parser refactor after testing * Process input refactor and process substrings feature * Show mapping statistics feature added * flake8 cleaning in diffupy * flake8 cleaning in diffupy * flake8 cleaning in diffupy * diffupy cli refactor and output format feature added * openpyxl dependency
1 parent d677230 commit 6411a91

12 files changed

Lines changed: 1410 additions & 348 deletions

File tree

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ install_requires =
5252
scipy
5353
pybel==0.13.2
5454
pandas
55+
openpyxl
5556

5657
# Random options
5758
zip_safe = false

src/diffupy/cli.py

Lines changed: 54 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# -*- coding: utf-8 -*-
22

3-
"""Command line interface for DiffuPy."""
3+
"""Command line interface for diffuPy."""
44

55
import json
66
import logging
@@ -10,12 +10,13 @@
1010
import time
1111

1212
import click
13+
from diffupy.process_network import get_kernel_from_network_path
1314

14-
from .constants import OUTPUT, METHODS, EMOJI
15+
from .constants import OUTPUT, METHODS, EMOJI, RAW, CSV, JSON
1516
from .diffuse import diffuse as run_diffusion
1617
from .kernels import regularised_laplacian_kernel
17-
from .process_input import process_input
18-
from .utils import process_network_from_cli
18+
from .process_input import process_map_and_format_input_data_for_diff
19+
from .process_network import process_graph_from_file
1920

2021
logger = logging.getLogger(__name__)
2122

@@ -42,9 +43,9 @@ def main():
4243
)
4344
@click.option('-l', '--log', is_flag=True, help='Activate debug mode')
4445
def kernel(
45-
network: str,
46-
output: str = OUTPUT,
47-
log: bool = None
46+
graph: str,
47+
output: str = OUTPUT,
48+
log: bool = None
4849
):
4950
"""Generate a kernel for a given network."""
5051
# Configure logging level
@@ -55,20 +56,20 @@ def kernel(
5556
logging.basicConfig(level=logging.INFO)
5657
logger.setLevel(logging.INFO)
5758

58-
click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}')
59+
click.secho(f'{EMOJI} Loading graph from {graph} {EMOJI}')
5960

60-
graph = process_network_from_cli(network)
61+
graph = process_graph_from_file(graph)
6162

62-
click.secho(f'{EMOJI} Calculating regularized Laplacian kernel. This might take a while... {EMOJI}')
63+
click.secho(f'{EMOJI} Generating regularized Laplacian kernel from graph. This might take a while... {EMOJI}')
6364
exe_t_0 = time.time()
64-
background_mat = regularised_laplacian_kernel(graph)
65+
kernel = regularised_laplacian_kernel(graph)
6566
exe_t_f = time.time()
6667

67-
output_file = os.path.join(output, f'{network.split("/")[-1]}.pickle')
68+
output_file = os.path.join(output, f'{graph.split("/")[-1]}.pickle')
6869

6970
# Export numpy array
7071
with open(output_file, 'wb') as file:
71-
pickle.dump(background_mat, file, protocol=4)
72+
pickle.dump(kernel, file, protocol=4)
7273

7374
running_time = exe_t_f - exe_t_0
7475

@@ -77,14 +78,14 @@ def kernel(
7778

7879
@main.command()
7980
@click.option(
80-
'-n', '--network',
81-
help='Path to the network graph or kernel',
81+
'-i', '--input',
82+
help='Input data',
8283
required=True,
8384
type=click.Path(exists=True, dir_okay=False)
8485
)
8586
@click.option(
86-
'-i', '--data',
87-
help='Input data',
87+
'-n', '--network',
88+
help='Path to the network graph or kernel',
8889
required=True,
8990
type=click.Path(exists=True, dir_okay=False)
9091
)
@@ -98,7 +99,7 @@ def kernel(
9899
'-m', '--method',
99100
help='Diffusion method',
100101
type=click.Choice(METHODS),
101-
required=True,
102+
default=RAW,
102103
)
103104
@click.option(
104105
'-b', '--binarize',
@@ -112,6 +113,7 @@ def kernel(
112113
@click.option(
113114
'-t', '--threshold',
114115
help='Codify node labels by applying a threshold to logFC in input.',
116+
default=None,
115117
type=float,
116118
)
117119
@click.option(
@@ -129,48 +131,55 @@ def kernel(
129131
default=0.05,
130132
show_default=True,
131133
)
134+
@click.option(
135+
'-f', '--output_format',
136+
help='Statistical significance (p-value).',
137+
type=float,
138+
default=CSV,
139+
show_default=True,
140+
)
132141
def diffuse(
133-
network: str,
134-
data: str,
135-
output: str,
136-
method: str,
137-
binarize: bool,
138-
absolute_value: bool,
139-
threshold: float,
140-
p_value: float,
142+
input: str,
143+
network: str,
144+
output: str = sys.stdout,
145+
method: str = RAW,
146+
binarize: bool = True,
147+
threshold: float = None,
148+
absolute_value: bool = True,
149+
p_value: float = 0.05,
150+
output_format: str = CSV
141151
):
142152
"""Run a diffusion method over a network or pre-generated kernel."""
143153
click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}')
144-
graph = process_network_from_cli(network)
145154

146-
click.secho(
147-
f'{EMOJI} Graph loaded with: \n'
148-
f'{graph.number_of_nodes()} nodes\n'
149-
f'{graph.number_of_edges()} edges\n'
150-
f'{EMOJI}'
151-
)
155+
kernel = get_kernel_from_network_path(network)
152156

153-
click.secho(f'Codifying data from {data}.')
157+
click.secho(f'Processing data input from {input}.')
154158

155-
label_dict = process_input(data, method, binarize, absolute_value, p_value, threshold)
159+
input_scores_dict = process_map_and_format_input_data_for_diff(input,
160+
kernel,
161+
method,
162+
binarize,
163+
absolute_value,
164+
p_value,
165+
threshold,
166+
)
156167

157-
click.secho(f'Running the diffusion algorithm.')
168+
click.secho(f'Computing the diffusion algorithm.')
158169

159170
results = run_diffusion(
160-
label_dict,
171+
input_scores_dict,
161172
method,
162-
graph,
173+
k=kernel
163174
)
164175

165-
# results = run_diffusion(
166-
# label_dict,
167-
# method,
168-
# graph,
169-
# )
176+
if output_format is CSV:
177+
results.to_csv(output)
170178

171-
# json.dump(results, output, indent=2)
179+
elif output_format is JSON:
180+
json.dump(results, output, indent=2)
172181

173-
click.secho(f'Finished!')
182+
click.secho(f'{EMOJI} Diffusion performed with success. Output located at {output} {EMOJI}')
174183

175184

176185
if __name__ == '__main__':

src/diffupy/constants.py

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,30 +59,47 @@ def ensure_output_dirs():
5959

6060
#: csv
6161
CSV = 'csv'
62+
#: xls
63+
XLS = 'xls'
64+
#: xlsx
65+
XLSX = 'xlsx'
6266
#: tsv
6367
TSV = 'tsv'
6468
#: graphML
6569
GRAPHML = 'graphml'
6670
#: bel
6771
BEL = 'bel'
6872
#: node link json
69-
NODE_LINK_JSON = 'json'
73+
JSON = 'json'
7074
#: pickle
71-
BEL_PICKLE = 'pickle'
75+
PICKLE = 'pickle'
7276
#: gml
7377
GML = 'gml'
7478
#: edge list
7579
EDGE_LIST = '.lst'
7680

77-
#: DiffuPath available network formats
78-
FORMATS = [
81+
XLS_FORMATS = (
82+
XLS,
83+
XLSX
84+
)
85+
86+
#: Available graph formats
87+
GRAPH_FORMATS = (
7988
CSV,
8089
TSV,
8190
GRAPHML,
8291
BEL,
83-
NODE_LINK_JSON,
84-
BEL_PICKLE,
85-
]
92+
JSON,
93+
PICKLE,
94+
)
95+
96+
#: Available kernel formats
97+
KERNEL_FORMATS = (
98+
CSV,
99+
TSV,
100+
JSON,
101+
PICKLE,
102+
)
86103

87104
#: Separators
88105
FORMAT_SEPARATOR_MAPPING = {
@@ -109,9 +126,22 @@ def ensure_output_dirs():
109126

110127
#: Node name
111128
NODE = 'Node'
129+
LABEL = 'Label'
130+
ENTITY = 'Entity'
131+
GENE = 'Gene'
132+
133+
NODE_LABELING = [
134+
NODE,
135+
LABEL,
136+
ENTITY,
137+
GENE
138+
]
139+
140+
#: Node type
141+
NODE_TYPE = 'NodeType'
142+
#: Unspecified score type
143+
SCORE = 'Score'
112144
#: Log2 fold change (logFC)
113145
LOG_FC = 'LogFC'
114146
#: Statistical significance (p-value)
115147
P_VALUE = 'p-value'
116-
#: Label
117-
LABEL = 'Label'

src/diffupy/diffuse.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ def diffuse(
5151
) -> Matrix:
5252
"""Run diffusion on a network given an input and a diffusion method.
5353
54-
:param input_scores: score collection, supplied as n-dimensional array. Could be 1-dimensional (List) or n-dimensional (Matrix).
55-
:param method: Selected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"]
54+
:param input_scores: score collection, supplied as n-dimensional array. Could be 1-dimensional (Vector) or n-dimensional (Matrix).
55+
:param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"]
5656
:param graph: A network as a graph. It could be optional if a Kernel is provided
5757
:param kwargs: Optional arguments:
5858
- k: a kernel [matrix] stemming from a graph, thus sparing the graph transformation process

0 commit comments

Comments
 (0)