2424# - {category}.{number}.genomic.fna.gz (基因组序列)
2525# - {category}.{number}.rna.fna.gz (RNA序列)
2626#
27- # Usage: ./build_dna_blast_db.sh [human_mouse |representative|complete|all]
28- # human_mouse : Download only Homo sapiens and Mus musculus sequences (minimal, smallest)
27+ # Usage: ./build_dna_blast_db.sh [human_mouse_drosophila_yeast |representative|complete|all]
28+ # human_mouse_drosophila_yeast : Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae sequences (minimal, smallest)
2929# representative: Download genomic sequences from major categories (recommended, smaller)
3030# Includes: vertebrate_mammalian, vertebrate_other, bacteria, archaea, fungi
3131# complete: Download all complete genomic sequences from complete/ directory (very large)
3636# For CentOS/RHEL/Fedora: sudo dnf install ncbi-blast+
3737# Or download from: https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/LATEST/
3838
# Download mode, taken from the first command-line argument. When no argument
# is given, fall back to the smallest set (human, mouse, fruit fly, yeast).
# The default must not contain surrounding whitespace: the value is compared
# verbatim against the `case` patterns further down.
DOWNLOAD_TYPE=${1:-human_mouse_drosophila_yeast}

# Better to use a stable DOWNLOAD_TMP name to support resuming downloads
DOWNLOAD_TMP=_downloading_dna
5858 echo " Using date as release identifier: ${RELEASE} "
5959fi
6060
61- # Function to check if a file contains target species
62- check_file_for_species () {
63- local url=$1
64- local filename=$2
65- local temp_file=" /tmp/check_${filename// \/ / _} "
61+ # First check if file is already downloaded locally
62+ if check_file_downloaded " ${filename} " ; then
63+ # File already exists, check if it contains target species
64+ # Check both compressed and decompressed versions
65+ local decompressed_file=" ${filename% .gz} "
66+ if [ -f " ${filename} " ]; then
67+ # Compressed file exists
68+ if gunzip -c " ${filename} " 2> /dev/null | head -2000 | grep -qE " (Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)" ; then
69+ return 0 # Contains target species
70+ else
71+ return 1 # Does not contain target species
72+ fi
73+ elif [ -f " ${decompressed_file} " ]; then
74+ # Decompressed file exists
75+ if head -2000 " ${decompressed_file} " 2> /dev/null | grep -qE " (Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)" ; then
76+ return 0 # Contains target species
77+ else
78+ return 1 # Does not contain target species
79+ fi
80+ fi
81+ fi
6682
83+ # File not downloaded yet, download first 500KB to check
6784 # Download first 500KB (enough to get many sequence headers)
6885 # This should be sufficient to identify the species in most cases
6986 if curl -s --max-time 30 --range 0-512000 " ${url} " -o " ${temp_file} " 2> /dev/null && [ -s " ${temp_file} " ]; then
7087 # Try to decompress and check for species names
71- if gunzip -c " ${temp_file} " 2> /dev/null | head -2000 | grep -qE " (Homo sapiens|Mus musculus)" ; then
88+ # Check for: Homo sapiens (人), Mus musculus (小鼠), Drosophila melanogaster (果蝇), Saccharomyces cerevisiae (酵母)
89+ if gunzip -c " ${temp_file} " 2> /dev/null | head -2000 | grep -qE " (Homo sapiens|Mus musculus|Drosophila melanogaster|Saccharomyces cerevisiae)" ; then
7290 rm -f " ${temp_file} "
7391 return 0 # Contains target species
7492 else
@@ -84,92 +102,134 @@ check_file_for_species() {
84102
85103# Download based on type
86104case ${DOWNLOAD_TYPE} in
87- human_mouse)
88- echo " Downloading RefSeq sequences for Homo sapiens and Mus musculus only (minimal size)..."
89- echo " This will check each file to see if it contains human or mouse sequences..."
90- category=" vertebrate_mammalian"
91- echo " Checking files in ${category} category..."
92-
93- # Get list of files and save to temp file to avoid subshell issues
94- curl -s " https://ftp.ncbi.nlm.nih.gov/refseq/release/${category} /" | \
95- grep -oE ' href="[^"]*\.genomic\.fna\.gz"' | \
96- sed ' s/href="\(.*\)"/\1/' > /tmp/refseq_files.txt
105+ human_mouse_drosophila_yeast)
106+ echo " Downloading RefSeq sequences for Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal size)..."
107+ echo " This will check each file to see if it contains target species sequences..."
97108
98- file_count=0
99- download_count=0
# Check multiple categories: vertebrate_mammalian (human, mouse),
# invertebrate (fruit fly), fungi (yeast).
#
# For every *.genomic.fna.gz file listed in each category directory:
#   - check_file_for_species decides whether the file holds a target species;
#   - files already present locally (check_file_downloaded) are not fetched
#     again but still count towards the "downloaded" total;
#   - a file whose wget fails is reported and NOT counted as downloaded.
categories="vertebrate_mammalian invertebrate fungi"
total_file_count=0
total_download_count=0

# Word splitting of ${categories} is intentional: it is a space-separated list.
for category in ${categories}; do
  echo " Checking files in ${category} category..."
  base_url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}"

  # The file list goes to a temp file so the loop below runs in the current
  # shell (counters survive). mktemp avoids a predictable name in /tmp.
  list_file=$(mktemp "${TMPDIR:-/tmp}/refseq_files_${category}.XXXXXX") || {
    echo " Warning: could not create a temporary file for ${category}" >&2
    continue
  }

  # -f: treat HTTP errors as failures instead of parsing an error page.
  if ! index_html=$(curl -fsS "${base_url}/"); then
    echo " Warning: could not list ${base_url}/" >&2
    rm -f -- "${list_file}"
    continue
  fi

  printf '%s\n' "${index_html}" \
    | grep -oE 'href="[^"]*\.genomic\.fna\.gz"' \
    | sed 's/href="\(.*\)"/\1/' > "${list_file}"

  file_count=0
  download_count=0

  while IFS= read -r filename; do
    [ -n "${filename}" ] || continue
    file_count=$((file_count + 1))
    total_file_count=$((total_file_count + 1))
    url="${base_url}/${filename}"
    echo -n " [${total_file_count} ] Checking ${category} /${filename} ... "

    if ! check_file_for_species "${url}" "${filename}"; then
      echo " ✗ skipping (no target species data)"
      continue
    fi

    if check_file_downloaded "${filename}"; then
      echo " ✓ already downloaded (contains target species)"
    else
      echo " ✓ contains target species, downloading..."
      if ! wget -c -q --show-progress "${url}"; then
        echo " Warning: Failed to download ${filename} "
        # Do not count a failed transfer as downloaded.
        continue
      fi
    fi

    download_count=$((download_count + 1))
    total_download_count=$((total_download_count + 1))
  done < "${list_file}"

  rm -f -- "${list_file}"
  echo " ${category} : Checked ${file_count} files, downloaded ${download_count} files."
done

echo " "
echo " Summary: Checked ${total_file_count} files total , downloaded ${total_download_count} files containing target species (human, mouse, fruit fly, yeast) ."
120156 ;;
121157 representative)
122158 echo " Downloading RefSeq representative sequences (recommended, smaller size)..."
123159 # Download major categories for representative coverage
124160 # Note: You can modify this list based on your specific requirements
# Download every *.genomic.fna.gz file from the major RefSeq categories,
# skipping files that check_file_downloaded reports as already present.
for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi; do
  echo " Downloading ${category} sequences..."
  base_url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}"

  # The file list goes to a temp file so the loop below is not run in a
  # pipeline subshell. mktemp avoids a predictable name in /tmp.
  list_file=$(mktemp "${TMPDIR:-/tmp}/refseq_files_${category}.XXXXXX") || {
    echo " Warning: could not create a temporary file for ${category}" >&2
    continue
  }

  # -f: treat HTTP errors as failures instead of parsing an error page.
  if ! index_html=$(curl -fsS "${base_url}/"); then
    echo " Warning: could not list ${base_url}/" >&2
    rm -f -- "${list_file}"
    continue
  fi

  printf '%s\n' "${index_html}" \
    | grep -oE 'href="[^"]*\.genomic\.fna\.gz"' \
    | sed 's/href="\(.*\)"/\1/' > "${list_file}"

  while IFS= read -r filename; do
    [ -n "${filename}" ] || continue
    if check_file_downloaded "${filename}"; then
      echo " ✓ ${filename} already downloaded, skipping..."
    else
      echo " Downloading ${filename} ..."
      # -c resumes a partially downloaded file.
      wget -c -q --show-progress "${base_url}/${filename}" || {
        echo " Warning: Failed to download ${filename} "
      }
    fi
  done < "${list_file}"

  rm -f -- "${list_file}"
done
138182 ;;
139183 complete)
echo " Downloading RefSeq complete genomic sequences (WARNING: very large, may take hours)..."
# Download every *.genomic.fna.gz file from the RefSeq complete/ directory,
# skipping files that check_file_downloaded reports as already present.
complete_url="https://ftp.ncbi.nlm.nih.gov/refseq/release/complete"

# The file list goes to a temp file so the loop below is not run in a
# pipeline subshell. mktemp avoids a predictable name in /tmp.
if ! list_file=$(mktemp "${TMPDIR:-/tmp}/refseq_files_complete.XXXXXX"); then
  echo " Warning: could not create a temporary file for the complete listing" >&2
# -f: treat HTTP errors as failures instead of parsing an error page.
elif ! index_html=$(curl -fsS "${complete_url}/"); then
  echo " Warning: could not list ${complete_url}/" >&2
  rm -f -- "${list_file}"
else
  printf '%s\n' "${index_html}" \
    | grep -oE 'href="[^"]*\.genomic\.fna\.gz"' \
    | sed 's/href="\(.*\)"/\1/' > "${list_file}"

  while IFS= read -r filename; do
    [ -n "${filename}" ] || continue
    if check_file_downloaded "${filename}"; then
      echo " ✓ ${filename} already downloaded, skipping..."
    else
      echo " Downloading ${filename} ..."
      # -c resumes a partially downloaded file.
      wget -c -q --show-progress "${complete_url}/${filename}" || {
        echo " Warning: Failed to download ${filename} "
      }
    fi
  done < "${list_file}"

  rm -f -- "${list_file}"
fi
151203 ;;
152204 all)
153205 echo " Downloading all RefSeq genomic sequences from all categories (WARNING: extremely large, may take many hours)..."
154206 # Download genomic sequences from all categories
# Download every *.genomic.fna.gz file from all RefSeq release categories,
# skipping files that check_file_downloaded reports as already present.
for category in vertebrate_mammalian vertebrate_other bacteria archaea fungi invertebrate plant viral protozoa mitochondrion plastid plasmid other; do
  echo " Downloading ${category} genomic sequences..."
  base_url="https://ftp.ncbi.nlm.nih.gov/refseq/release/${category}"

  # The file list goes to a temp file so the loop below is not run in a
  # pipeline subshell. mktemp avoids a predictable name in /tmp.
  list_file=$(mktemp "${TMPDIR:-/tmp}/refseq_files_${category}.XXXXXX") || {
    echo " Warning: could not create a temporary file for ${category}" >&2
    continue
  }

  # -f: treat HTTP errors as failures instead of parsing an error page.
  if ! index_html=$(curl -fsS "${base_url}/"); then
    echo " Warning: could not list ${base_url}/" >&2
    rm -f -- "${list_file}"
    continue
  fi

  printf '%s\n' "${index_html}" \
    | grep -oE 'href="[^"]*\.genomic\.fna\.gz"' \
    | sed 's/href="\(.*\)"/\1/' > "${list_file}"

  while IFS= read -r filename; do
    [ -n "${filename}" ] || continue
    if check_file_downloaded "${filename}"; then
      echo " ✓ ${filename} already downloaded, skipping..."
    else
      echo " Downloading ${filename} ..."
      # -c resumes a partially downloaded file.
      wget -c -q --show-progress "${base_url}/${filename}" || {
        echo " Warning: Failed to download ${filename} "
      }
    fi
  done < "${list_file}"

  rm -f -- "${list_file}"
done
168228 ;;
169229 * )
170230 echo " Error: Unknown download type '${DOWNLOAD_TYPE} '"
171- echo " Usage: $0 [human_mouse |representative|complete|all]"
172- echo " human_mouse : Download only Homo sapiens and Mus musculus (minimal)"
231+ echo " Usage: $0 [human_mouse_drosophila_yeast |representative|complete|all]"
232+ echo " human_mouse_drosophila_yeast : Download only Homo sapiens, Mus musculus, Drosophila melanogaster, and Saccharomyces cerevisiae (minimal)"
173233 echo " representative: Download major categories (recommended)"
174234 echo " complete: Download all complete genomic sequences (very large)"
175235 echo " all: Download all genomic sequences (extremely large)"
0 commit comments