Skip to content

Commit fc18746

Browse files
committed
feat: multi omics KG building
1 parent 02adac3 commit fc18746

47 files changed

Lines changed: 1350 additions & 835 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

examples/search/build_db/build_dna_blast_db.sh

Lines changed: 108 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ set -e
2424
# - {category}.{number}.genomic.fna.gz (genomic sequences)
2525
# - {category}.{number}.rna.fna.gz (RNA sequences)
2626
#
27-
# Usage: ./build_dna_blast_db.sh [human_mouse|representative|complete|all]
28-
# human_mouse: Download only Homo sapiens and Mus musculus sequences (minimal, smallest)
27+
# Usage: ./build_dna_blast_db.sh [human_mouse_drosophila_yeast|representative|complete|all]
28+
# human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest)
2929
# representative: Download genomic sequences from major categories (recommended, smaller)
3030
# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
3131
# complete: Download all complete genomic sequences from complete/ directory (very large)
@@ -36,7 +36,7 @@ set -e
3636
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
3737
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
3838

39-
DOWNLOAD_TYPE=${1:-human_mouse}
39+
DOWNLOAD_TYPE=${1:-human_mouse_drosophila_yeast}
4040

4141
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
4242
DOWNLOAD_TMP=_downloading_dna
@@ -58,17 +58,35 @@ else
5858
echo "Using date as release identifier: ${RELEASE}"
5959
fi
6060

61-
# Function to check if a file contains target species
62-
check_file_for_species() {
63-
local url=$1
64-
local filename=$2
65-
local temp_file="/tmp/check_${filename//\//_}"
61+
# First check if file is already downloaded locally
62+
if check_file_downloaded "${filename}"; then
63+
# File already exists, check if it contains target species
64+
# Check both compressed and decompressed versions
65+
local decompressed_file="${filename%.gz}"
66+
if [ -f "${filename}" ]; then
67+
# Compressed file exists
68+
if gunzip -c "${filename}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then
69+
return 0 # Contains target species
70+
else
71+
return 1 # Does not contain target species
72+
fi
73+
elif [ -f "${decompressed_file}" ]; then
74+
# Decompressed file exists
75+
if head -2000 "${decompressed_file}" 2>/dev/null | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then
76+
return 0 # Contains target species
77+
else
78+
return 1 # Does not contain target species
79+
fi
80+
fi
81+
fi
6682

83+
# File not downloaded yet; download the first 500KB to check
6784
# Download first 500KB (enough to get many sequence headers)
6885
# This should be sufficient to identify the species in most cases
6986
if curl -s --max-time 30 --range 0-512000 "${url}" -o "${temp_file}" 2>/dev/null && [ -s "${temp_file}" ]; then
7087
# Try to decompress and check for species names
71-
if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus)"; then
88+
# Check for: Homo sapiens (human), Mus musculus (mouse), Drosophila melanogaster (fruit fly), Saccharomyces cerevisiae (yeast)
89+
if gunzip -c "${temp_file}" 2>/dev/null | head -2000 | grep -qE "(Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)"; then
7290
rm -f "${temp_file}"
7391
return 0 # Contains target species
7492
else
@@ -84,92 +102,134 @@ check_file_for_species() {
84102

85103
# Download based on type
86104
case ${DOWNLOAD_TYPE} in
87-
human_mouse)
88-
echo "Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..."
89-
echo "This will check each file to see if it contains human or mouse sequences..."
90-
category="vertebrate_mammalian"
91-
echo "Checking files in ${category} category..."
92-
93-
# Get list of files and save to temp file to avoid subshell issues
94-
curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
95-
grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
96-
sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files.txt
105+
human_mouse_drosophila_yeast)
106+
echo "Downloading RefSeq sequences for Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal size)..."
107+
echo "This will check each file to see if it contains target species sequences..."
97108

98-
file_count=0
99-
download_count=0
109+
# Check multiple categories: vertebrate_mammalian (human, mouse), invertebrate (fruit fly), fungi (yeast)
110+
categories="vertebrate_mammalian invertebrate fungi"
111+
total_file_count=0
112+
total_download_count=0
100113

101-
while read filename; do
102-
file_count=$((file_count + 1))
103-
url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
104-
echo -n "[${file_count}] Checking ${filename}... "
114+
for category in ${categories}; do
115+
echo "Checking files in ${category} category..."
105116

106-
if check_file_for_species "${url}" "${filename}"; then
107-
echo "✓ contains target species, downloading..."
108-
download_count=$((download_count + 1))
109-
wget -c -q --show-progress "${url}" || {
110-
echo "Warning: Failed to download ${filename}"
111-
}
112-
else
113-
echo "✗ skipping (no human/mouse data)"
114-
fi
115-
done < /tmp/refseq_files.txt
117+
# Get list of files and save to temp file to avoid subshell issues
118+
curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
119+
grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
120+
sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt
121+
122+
file_count=0
123+
download_count=0
124+
125+
while read filename; do
126+
file_count=$((file_count + 1))
127+
total_file_count=$((total_file_count + 1))
128+
url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}"
129+
echo -n "[${total_file_count}] Checking ${category}/${filename}... "
130+
131+
if check_file_for_species "${url}" "${filename}"; then
132+
# Check if file is already downloaded
133+
if check_file_downloaded "${filename}"; then
134+
echo "✓ already downloaded (contains target species)"
135+
download_count=$((download_count + 1))
136+
total_download_count=$((total_download_count + 1))
137+
else
138+
echo "✓ contains target species, downloading..."
139+
download_count=$((download_count + 1))
140+
total_download_count=$((total_download_count + 1))
141+
wget -c -q --show-progress "${url}" || {
142+
echo "Warning: Failed to download ${filename}"
143+
}
144+
fi
145+
else
146+
echo "✗ skipping (no target species data)"
147+
fi
148+
done < /tmp/refseq_files_${category}.txt
149+
150+
rm -f /tmp/refseq_files_${category}.txt
151+
echo " ${category}: Checked ${file_count} files, downloaded ${download_count} files."
152+
done
116153

117-
rm -f /tmp/refseq_files.txt
118154
echo ""
119-
echo "Summary: Checked ${file_count} files, downloaded ${download_count} files containing human or mouse sequences."
155+
echo "Summary: Checked ${total_file_count} files total, downloaded ${total_download_count} files containing target species (human, mouse, fruit fly, yeast)."
120156
;;
121157
representative)
122158
echo "Downloading RefSeq representative sequences (recommended, smaller size)..."
123159
# Download major categories for representative coverage
124160
# Note: You can modify this list based on your specific requirements
125161
for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do
126162
echo "Downloading ${category} sequences..."
163+
# Get list of files and save to temp file to avoid subshell issues
127164
curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
128165
grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
129-
sed 's/href="\(.*\)"/\1/' | \
130-
while read filename; do
166+
sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt
167+
168+
while read filename; do
169+
if check_file_downloaded "${filename}"; then
170+
echo "${filename} already downloaded, skipping..."
171+
else
131172
echo " Downloading ${filename}..."
132173
wget -c -q --show-progress \
133174
"https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || {
134175
echo "Warning: Failed to download ${filename}"
135176
}
136-
done
177+
fi
178+
done < /tmp/refseq_files_${category}.txt
179+
180+
rm -f /tmp/refseq_files_${category}.txt
137181
done
138182
;;
139183
complete)
140184
echo "Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..."
185+
# Get list of files and save to temp file to avoid subshell issues
141186
curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/" | \
142187
grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
143-
sed 's/href="\(.*\)"/\1/' | \
144-
while read filename; do
188+
sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_complete.txt
189+
190+
while read filename; do
191+
if check_file_downloaded "${filename}"; then
192+
echo "${filename} already downloaded, skipping..."
193+
else
145194
echo " Downloading ${filename}..."
146195
wget -c -q --show-progress \
147196
"https://ftp.ncbi.nlm.nih.gov/refseq/release/complete/${filename}" || {
148197
echo "Warning: Failed to download ${filename}"
149198
}
150-
done
199+
fi
200+
done < /tmp/refseq_files_complete.txt
201+
202+
rm -f /tmp/refseq_files_complete.txt
151203
;;
152204
all)
153205
echo "Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..."
154206
# Download genomic sequences from all categories
155207
for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do
156208
echo "Downloading ${category} genomic sequences..."
209+
# Get list of files and save to temp file to avoid subshell issues
157210
curl -s "https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/" | \
158211
grep -oE 'href="[^"]*\.genomic\.fna\.gz"' | \
159-
sed 's/href="\(.*\)"/\1/' | \
160-
while read filename; do
212+
sed 's/href="\(.*\)"/\1/' > /tmp/refseq_files_${category}.txt
213+
214+
while read filename; do
215+
if check_file_downloaded "${filename}"; then
216+
echo "${filename} already downloaded, skipping..."
217+
else
161218
echo " Downloading ${filename}..."
162219
wget -c -q --show-progress \
163220
"https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}/${filename}" || {
164221
echo "Warning: Failed to download ${filename}"
165222
}
166-
done
223+
fi
224+
done < /tmp/refseq_files_${category}.txt
225+
226+
rm -f /tmp/refseq_files_${category}.txt
167227
done
168228
;;
169229
*)
170230
echo "Error: Unknown download type '${DOWNLOAD_TYPE}'"
171-
echo "Usage: $0 [human_mouse|representative|complete|all]"
172-
echo " human_mouse: Download only Homo sapiens and Mus musculus (minimal)"
231+
echo "Usage: $0 [human_mouse_drosophila_yeast|representative|complete|all]"
232+
echo " human_mouse_drosophila_yeast: Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal)"
173233
echo " representative: Download major categories (recommended)"
174234
echo " complete: Download all complete genomic sequences (very large)"
175235
echo " all: Download all genomic sequences (extremely large)"

examples/search/build_db/build_protein_blast_db.sh

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,48 +9,78 @@ set -e
99
# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
1010
# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
1111

12-
# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
13-
DOWNLOAD_TMP=_downloading
14-
mkdir -p ${DOWNLOAD_TMP}
15-
cd ${DOWNLOAD_TMP}
16-
17-
wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/RELEASE.metalink"
12+
echo "Downloading RELEASE.metalink..."
13+
wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/RELEASE.metalink"
1814

1915
# Extract the release name (like 2017_10 or 2017_1)
2016
# Use sed for cross-platform compatibility (works on both macOS and Linux)
2117
RELEASE=$(sed -n 's/.*<version>\([0-9]\{4\}_[0-9]\{1,2\}\)<\/version>.*/\1/p' RELEASE.metalink | head -1)
2218

23-
wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
24-
wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
25-
wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/reldate.txt"
26-
wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/README"
27-
wget -c "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/LICENSE"
19+
echo "UniProt release: ${RELEASE}"
20+
echo ""
21+
22+
# Download Swiss-Prot (always needed)
23+
echo "Downloading uniprot_sprot.fasta.gz..."
24+
wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
25+
26+
# Download TrEMBL only in full mode
27+
if [ "${DOWNLOAD_MODE}" = "full" ]; then
28+
echo "Downloading uniprot_trembl.fasta.gz..."
29+
wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
30+
fi
31+
32+
# Download metadata files
33+
echo "Downloading metadata files..."
34+
wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/reldate.txt"
35+
wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/README"
36+
wget -c "${UNIPROT_BASE}/current_release/knowledgebase/complete/LICENSE"
2837

2938
cd ..
3039

31-
mkdir ${RELEASE}
40+
mkdir -p ${RELEASE}
3241
mv ${DOWNLOAD_TMP}/* ${RELEASE}
3342
rmdir ${DOWNLOAD_TMP}
3443

3544
cd ${RELEASE}
3645

46+
echo ""
47+
echo "Extracting files..."
3748
gunzip uniprot_sprot.fasta.gz
38-
gunzip uniprot_trembl.fasta.gz
3949

40-
cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta
50+
if [ "${DOWNLOAD_MODE}" = "full" ]; then
51+
gunzip uniprot_trembl.fasta.gz
52+
echo "Merging Swiss-Prot and TrEMBL..."
53+
cat uniprot_sprot.fasta uniprot_trembl.fasta >uniprot_${RELEASE}.fasta
54+
fi
4155

42-
makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE}
56+
echo ""
57+
echo "Building BLAST databases..."
58+
59+
# Always build Swiss-Prot database
4360
makeblastdb -in uniprot_sprot.fasta -out uniprot_sprot -dbtype prot -parse_seqids -title uniprot_sprot
44-
makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl
61+
62+
# Build full release database only if in full mode
63+
if [ "${DOWNLOAD_MODE}" = "full" ]; then
64+
makeblastdb -in uniprot_${RELEASE}.fasta -out uniprot_${RELEASE} -dbtype prot -parse_seqids -title uniprot_${RELEASE}
65+
makeblastdb -in uniprot_trembl.fasta -out uniprot_trembl -dbtype prot -parse_seqids -title uniprot_trembl
66+
fi
4567

4668
cd ..
4769

70+
echo ""
4871
echo "BLAST databases created successfully!"
4972
echo "Database locations:"
50-
echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}"
51-
echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot"
52-
echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl"
53-
echo ""
54-
echo "To use these databases, set in your config:"
55-
echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl"
73+
if [ "${DOWNLOAD_MODE}" = "sprot" ]; then
74+
echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot"
75+
echo ""
76+
echo "To use this database, set in your config:"
77+
echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot"
78+
else
79+
echo " - Combined: $(pwd)/${RELEASE}/uniprot_${RELEASE}"
80+
echo " - Swiss-Prot: $(pwd)/${RELEASE}/uniprot_sprot"
81+
echo " - TrEMBL: $(pwd)/${RELEASE}/uniprot_trembl"
82+
echo ""
83+
echo "To use these databases, set in your config:"
84+
echo " local_blast_db: $(pwd)/${RELEASE}/uniprot_sprot # or uniprot_${RELEASE} or uniprot_trembl"
85+
fi
5686

0 commit comments

Comments
 (0)