This guide gives exact commands to set up and run the project end-to-end.
Install OS packages (Ubuntu/Debian):
sudo apt update
sudo apt install -y python3 python3-pip python3-venv openjdk-17-jdk docker.io docker-compose-plugin make

Enable Docker for your user:
sudo systemctl enable --now docker
sudo usermod -aG docker "$USER"
newgrp docker
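
Optional sanity check that Docker now works for your user without sudo (hello-world is Docker's standard test image):
docker run --rm hello-world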

Clone the repository:
cd ~
git clone <YOUR_REPO_URL> HIFUN-Router-clone
cd HIFUN-Router-clone

If the repository is already present:
cd ~/HIFUN-Router-clone

Create a virtual environment and install Python dependencies:
cd ~/HIFUN-Router-clone
python3 -m venv .venv
source .venv/bin/activate
python -m pip install --upgrade pip wheel setuptools
pip install -r requirements.txt

Set runtime environment variables for Spark + Python imports:
export PYTHONPATH="$PWD"
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
export PATH="$JAVA_HOME/bin:$PATH"
mkdir -p /tmp/spark-events
export HIFUN_HISTORY_SERVER=/tmp/spark-events
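
These exports only last for the current shell. One way to avoid retyping them in new sessions is a small env file you source from the repo root (a sketch; the env.sh name is just a suggestion):
cat > env.sh <<'EOF'
# Source this from the repo root so PYTHONPATH resolves to it.
export PYTHONPATH="$PWD"
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
export PATH="$JAVA_HOME/bin:$PATH"
export HIFUN_HISTORY_SERVER=/tmp/spark-events
mkdir -p /tmp/spark-events
EOF
source env.sh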

Verify environment:
python test_setup.py

Use this if you want to run immediately without regenerating datasets:
cd ~/HIFUN-Router-clone
source .venv/bin/activate
export PYTHONPATH="$PWD"
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
export PATH="$JAVA_HOME/bin:$PATH"
mkdir -p /tmp/spark-events
export HIFUN_HISTORY_SERVER=/tmp/spark-events
python -m model.trainer
python -m experiments.relevance_evaluation
python -m experiments.dataset_shift_evaluation --source training_data/real_labeled_runs.csv
pytest -q

SNB raw generation via Docker datagen:
mkdir -p data/raw/ldbc_snb
docker run --rm \
--mount type=bind,source="$(pwd)/data/raw/ldbc_snb",target=/out \
ldbc/datagen-standalone:latest \
--parallelism 1 -- --format csv --scale-factor 1 --mode raw --output-dir /out
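
Once datagen finishes, a quick check that raw CSVs landed on the host (optional):
find data/raw/ldbc_snb -name '*.csv' | head
du -sh data/raw/ldbc_snb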

Convert SNB raw CSV to parquet + graph:
python -m data.scripts.ldbc_snb_to_parquet --input data/raw/ldbc_snb --parquet-dir data/parquet/snb --graph-dir data/graphs/snb
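
To spot-check one converted table, a minimal sketch assuming pandas and pyarrow are installed in the venv (this guide does not install them; use Spark instead if you prefer):
python - <<'PY'
# Spot check: read the first parquet file found and print its shape.
import glob
import pandas as pd
files = sorted(glob.glob("data/parquet/snb/**/*.parquet", recursive=True))
assert files, "no parquet files under data/parquet/snb"
df = pd.read_parquet(files[0])
print(files[0], df.shape)
print(df.head())
PY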

Download + convert OGB graph dataset:
python -m data.scripts.ogb_to_parquet --dataset ogbn-arxiv --root data/raw/ogb --graph-dir data/graphs
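
OGB downloads can take a while; to confirm both the raw download and the converted graph landed (optional):
du -sh data/raw/ogb data/graphs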

Convert JOB/IMDB dumps (if available in data/raw/job):
python -m data.scripts.job_to_parquet --input data/raw/job --output data/parquet/job

Convert TPC-DS dsdgen output (if available in data/raw/tpcds):
python -m data.scripts.tpcds_to_parquet --input data/raw/tpcds --output data/parquet/tpcds

Compute dataset statistics:
python -m data.scripts.compute_stats

Generate training queries:
python -m training_data.real_query_generator --scale aggressive --focus-mode all

Optional class-diversity focused generation:
python -m training_data.real_query_generator --scale balanced --focus-mode graph_win
python -m training_data.real_query_generator --scale balanced --focus-mode sql_win
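
Optional check on how many query files the generators produced (this assumes they write into dsl/sample_queries, the directory the collection runs below read from; point ls elsewhere if your generator output differs):
ls dsl/sample_queries | wc -l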

Standard run:
python -m training_data.real_collection_script \
--queries_dir dsl/sample_queries \
--output training_data/real_labeled_runs.csv \
--n_warmup 2 --n_measure 3 --repeat 3
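
A quick look at the collected dataset (a sketch; assumes pandas is importable in the venv, and makes no assumptions about column names):
python - <<'PY'
# Print row/column counts and the header of the labeled-runs CSV.
import pandas as pd
df = pd.read_csv("training_data/real_labeled_runs.csv")
print(df.shape)
print(df.columns.tolist())
PY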

Strict real-only run with failure taxonomy report:
python -m training_data.real_collection_script \
--queries_dir dsl/sample_queries \
--output training_data/real_labeled_runs_strict.csv \
--n_warmup 2 --n_measure 3 --repeat 3 \
--strict_real_only \
--failure_report training_data/real_collection_failures.json

Rebuild dataset splits:
python -m training_data.fix_dataset_splits --source training_data/real_labeled_runs.csv

Train the model:
python -m model.trainer

Relevance evaluation:
python -m experiments.relevance_evaluation

Dataset-shift evaluation:
python -m experiments.dataset_shift_evaluation --source training_data/real_labeled_runs.csv

Expected outputs:
ls -lh experiments/results/relevance_eval.json experiments/results/relevance_eval.md
ls -lh experiments/results/dataset_shift_eval.json experiments/results/dataset_shift_eval.md
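
The .json outputs are machine-readable; to skim one quickly (optional, stdlib only):
python -m json.tool experiments/results/relevance_eval.json | head -40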

Run all tests:
pytest -q

Run correctness-focused test suites:
pytest tests/test_execution.py -v
pytest tests/test_correctness.py -v
pytest tests/test_experiments.py -v
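
To run a single test by name while debugging, standard pytest filtering applies (nothing project-specific; replace <pattern> with part of a real test name):
pytest tests/test_correctness.py -k "<pattern>" -v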

Run one sample query through the router using the Makefile helper:
make run-query QUERY=dsl/sample_queries/tpch_queries.json

Run the complete synthetic query pack:
make run-synthetic

Start all services:
docker compose up -d
docker compose ps
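
To block until the Spark master UI answers before submitting work (a small sketch; port 8080 matches the UI list below):
until curl -sf http://localhost:8080 > /dev/null; do
  echo "waiting for Spark master..."
  sleep 2
done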

Watch Spark master logs:
docker compose logs -f spark-master

Stop services:
docker compose down

Useful UIs:
- Spark Master: http://localhost:8080
- Spark Worker 1: http://localhost:8081
- Spark Worker 2: http://localhost:8082
- Spark History Server: http://localhost:18080
- NameNode: http://localhost:9870
- YARN Resource Manager: http://localhost:8088
Show all commands:
make help

Common shortcuts:
make setup
make test-env
make data-all
make train
make evaluate
make test-all
make clean

If you see "JAVA_HOME is not set":
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
export PATH="$JAVA_HOME/bin:$PATH"

If you see ModuleNotFoundError: No module named 'config':
cd ~/HIFUN-Router-clone
export PYTHONPATH="$PWD"
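
To confirm the import path fix without rerunning a full pipeline ('config' is the module the error names):
python -c "import config; print(config.__file__)"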

If Spark fails while downloading packages on first run, retry:
python test_setup.py