# GraphGen/examples/generate/generate_omics_qa/omics_qa_config.yaml
# (InternScience/GraphGen at commit 36f17c5ab5d1e637a30097f665a1641fd8e89b8c)

# Global parameters: working directory and storage backends.
global_params:
  working_dir: cache
  # Graph database backend; supported values: kuzu, networkx.
  graph_backend: kuzu
  # Key-value store backend; supported values: rocksdb, json_kv.
  kv_backend: rocksdb

nodes:
  # Source node (op `read`); it declares no dependencies.
  - id: read_files
    op_name: read
    type: source
    dependencies: []
    params:
      # Three input files, so that DNA, RNA, and protein data are generated together.
      input_path:
        - examples/input_examples/search_dna_demo.jsonl
        - examples/input_examples/search_rna_demo.jsonl
        - examples/input_examples/search_protein_demo.jsonl

  # map_batch node (op `search`); depends on `read_files`.
  - id: search_data
    op_name: search
    type: map_batch
    dependencies: [read_files]
    execution_params:
      replicas: 1
      batch_size: 10
    params:
      # Multi-omics: use all three data sources.
      data_sources: [ncbi, rnacentral, uniprot]

      # DNA search parameters.
      ncbi_params:
        # Required for NCBI; replace the placeholder address with your own.
        email: your_email@example.com
        tool: GraphGen
        use_local_blast: true
        local_blast_db: path_to_your_local_blast_db/refseq_version/refseq_version
        blast_num_threads: 2
        max_concurrent: 5

      # RNA search parameters.
      rnacentral_params:
        use_local_blast: true
        local_blast_db: path_to_your_local_blast_db/rnacentral_YYYYMMDD/rnacentral_YYYYMMDD
        blast_num_threads: 2
        max_concurrent: 5

      # Protein search parameters.
      uniprot_params:
        use_local_blast: true
        # NOTE(review): `${RELEASE}` is a different placeholder style from the two
        # paths above; confirm whether the config loader expands it or whether it
        # must be replaced by hand with the actual release directory.
        local_blast_db: path_to_your_local_blast_db/${RELEASE}/uniprot_sprot
        blast_num_threads: 2
        max_concurrent: 5

  # map_batch node (op `chunk`); depends on `search_data`.
  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies: [search_data]
    execution_params:
      replicas: 4
    params:
      # Chunk size for text splitting.
      chunk_size: 1024
      # Chunk overlap for text splitting.
      chunk_overlap: 100
      # For sequence chunks (bp for DNA/RNA, aa for protein).
      sequence_chunk_size: 1000
      sequence_chunk_overlap: 100

  # map_batch node (op `build_kg`); depends on `chunk_documents`.
  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies: [chunk_documents]
    execution_params:
      replicas: 1
      batch_size: 128

  # aggregate node (op `partition`); depends on `build_kg`.
  - id: partition
    op_name: partition
    type: aggregate
    dependencies: [build_kg]
    params:
      # Partition method.
      method: anchor_bfs
      method_params:
        # Multi-omics: multiple anchor types are supported (a list or a single string).
        anchor_type: [dna, rna, protein]
        # Max nodes and edges per community.
        max_units_per_community: 10

  # map_batch node (op `generate`); depends on `partition`.
  - id: generate
    op_name: generate
    type: map_batch
    dependencies: [partition]
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      # Unified QA generation method for DNA/RNA/Protein.
      method: omics_qa
      # One of: Alpaca, Sharegpt, ChatML.
      data_format: ChatML