# omics_qa_config.yaml
# Global parameters: the working directory and the storage backends.
global_params:
  working_dir: cache
  # graph database backend, support: kuzu, networkx
  graph_backend: kuzu
  # key-value store backend, support: rocksdb, json_kv
  kv_backend: rocksdb
# Pipeline nodes. Each entry declares an id, an operator (op_name), a node
# type, and the ids of the nodes it depends on. Sequence items are written
# flush with their parent key.
nodes:
# Source node: `read` operator, no upstream dependencies.
- id: read_files
  op_name: read
  type: source
  dependencies: []
  params:
    # three input files to generate DNA, RNA, and Protein data together
    input_path:
    - examples/input_examples/search_dna_demo.jsonl
    - examples/input_examples/search_rna_demo.jsonl
    - examples/input_examples/search_protein_demo.jsonl
# `search` operator; depends on read_files. One parameter mapping per data
# source listed in data_sources.
- id: search_data
  op_name: search
  type: map_batch
  dependencies:
  - read_files
  execution_params:
    replicas: 1
    batch_size: 10
  params:
    # Multi-omics: use all three data sources
    data_sources: [ncbi, rnacentral, uniprot]
    # DNA search parameters
    ncbi_params:
      email: your_email@example.com  # Required for NCBI
      tool: GraphGen
      use_local_blast: true
      # placeholder path: replace with your local BLAST database location
      local_blast_db: path_to_your_local_blast_db/refseq_version/refseq_version
      blast_num_threads: 2
      max_concurrent: 5
    # RNA search parameters
    rnacentral_params:
      use_local_blast: true
      # placeholder path: replace with your local BLAST database location
      local_blast_db: path_to_your_local_blast_db/rnacentral_YYYYMMDD/rnacentral_YYYYMMDD
      blast_num_threads: 2
      max_concurrent: 5
    # Protein search parameters
    uniprot_params:
      use_local_blast: true
      # NOTE(review): this placeholder uses ${RELEASE} while the DNA/RNA paths
      # use plain-text placeholders -- confirm whether the loader expands
      # environment variables or whether this must be edited by hand.
      local_blast_db: path_to_your_local_blast_db/${RELEASE}/uniprot_sprot
      blast_num_threads: 2
      max_concurrent: 5
# `chunk` operator; depends on search_data. Separate size/overlap settings
# for text and for sequence chunks.
- id: chunk_documents
  op_name: chunk
  type: map_batch
  dependencies:
  - search_data
  execution_params:
    replicas: 4
  params:
    # chunk size for text splitting
    chunk_size: 1024
    # chunk overlap for text splitting
    chunk_overlap: 100
    # For sequence chunks (bp for DNA/RNA, aa for protein)
    sequence_chunk_size: 1000
    sequence_chunk_overlap: 100
# `build_kg` operator; depends on chunk_documents. No operator params are set
# here, only execution_params.
- id: build_kg
  op_name: build_kg
  type: map_batch
  dependencies:
  - chunk_documents
  execution_params:
    replicas: 1
    batch_size: 128
# `partition` operator (aggregate node); depends on build_kg.
- id: partition
  op_name: partition
  type: aggregate
  dependencies:
  - build_kg
  params:
    # partition method
    method: anchor_bfs
    method_params:
      # Multi-omics: support multiple anchor types (list or single string)
      anchor_type: [dna, rna, protein]
      # max nodes and edges per community
      max_units_per_community: 10
# `generate` operator; depends on partition.
- id: generate
  op_name: generate
  type: map_batch
  dependencies:
  - partition
  execution_params:
    replicas: 1
    batch_size: 128
  params:
    # unified QA generation method for DNA/RNA/Protein
    method: omics_qa
    # Alpaca, Sharegpt, ChatML
    data_format: ChatML