-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
256 lines (199 loc) · 15.5 KB
/
Makefile
File metadata and controls
256 lines (199 loc) · 15.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
# Declare every command-style target as phony so a stray file with the same
# name can never mask it.  Split into additive .PHONY lines, grouped by phase,
# and extended with the targets the original declaration omitted (test-env,
# data-synthetic-all, run-*, pipeline, test-*, baselines, learned, compare,
# ablation, evaluate, correctness-native).
.PHONY: help setup test-env
.PHONY: data-tpch data-tpch-duckdb data-tpch-check data-snb data-synthetic data-synthetic-all data-stats data-graphs-real data-all
.PHONY: validate-queries test test-execution test-correctness test-experiments test-all
.PHONY: collect-data collect-coverage-boost collect-coverage-boost-iter train analyze report
.PHONY: run-query run-tpch run-synthetic pipeline
.PHONY: baseline-sql baseline-graph baseline-rule baselines learned compare ablation evaluate
.PHONY: quality-gate quality-gate-strict quality-gate-coverage-all build-all-validated-data
.PHONY: publish-eval-all publish-eval-all-coverage publish-eval publish-eval-strict publish-gate publish-gate-native
.PHONY: status-snapshot correctness-native clean clean-local-results
# --- Toolchain knobs (override from the command line, e.g. `make PYTHON=python3.11`) ---
# Python interpreter used by every recipe below.
PYTHON := python3
# Spark launcher binary.  NOTE(review): not referenced by any target visible
# in this file — presumably kept for manual runs; confirm before removing.
SPARK_SUBMIT := spark-submit
# Path to the venv activation script.  NOTE(review): also unreferenced by the
# recipes here; no recipe sources it.
VENV := source/bin/activate
# Self-documenting help: greps this Makefile for lines of the form
# `target: ## description`, sorts them, and prints target/description pairs
# aligned in a 20-char column (the \033[36m escape colors the target cyan).
help: ## Show this help message
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \
	awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}'
# --- Phase 1: Setup ---
# Installs Python dependencies with the ambient `pip` (not $(PYTHON) -m pip).
setup: ## Install Python dependencies
	pip install -r requirements.txt
# Runs the repo's environment smoke test (test_setup.py).
test-env: ## Verify Spark + GraphFrames environment
	$(PYTHON) test_setup.py
# --- Phase 2: Data Preparation ---
# Converts dbgen's .tbl output to Parquet under data/parquet/tpch/.
data-tpch: ## Convert TPC-H raw .tbl files to Parquet
	$(PYTHON) data/scripts/tpch_to_parquet.py \
		--input data/raw/tpch-kit/dbgen \
		--output data/parquet/tpch/
# Alternative TPCH path: generate customer/orders parquet at scale factor 1
# via DuckDB's TPCH extension (no dbgen needed).
data-tpch-duckdb: ## Generate TPCH customer/orders parquet via DuckDB TPCH extension
	$(PYTHON) data/scripts/tpch_duckdb_to_parquet.py --output data/parquet/tpch --sf 1
# Fails fast (exit 1) with a message naming the first missing .tbl input.
data-tpch-check: ## Verify required TPCH .tbl inputs exist
	@test -f data/raw/tpch-kit/dbgen/customer.tbl || (echo "Missing data/raw/tpch-kit/dbgen/customer.tbl" && exit 1)
	@test -f data/raw/tpch-kit/dbgen/orders.tbl || (echo "Missing data/raw/tpch-kit/dbgen/orders.tbl" && exit 1)
	@echo "TPCH raw input files found."
data-snb: ## Ingest LDBC SNB data (or generate synthetic SNB if raw unavailable)
	$(PYTHON) data/scripts/snb_to_parquet.py \
		--input data/raw/snb \
		--tables data/parquet/snb/ \
		--edges data/graphs/snb_edges.parquet
data-synthetic: ## Generate synthetic power-law graphs (default: avg_degree=10 only)
	$(PYTHON) data/scripts/generate_synthetic.py --default-only
data-synthetic-all: ## Generate synthetic graphs for all degree variants [2,5,10,20,50]
	$(PYTHON) data/scripts/generate_synthetic.py --degrees 2,5,10,20,50
data-stats: ## Precompute table and graph statistics
	$(PYTHON) data/scripts/compute_stats.py
data-graphs-real: ## Build JOB/TPCDS graph projections for graph-win collection
	$(PYTHON) data/scripts/build_dataset_graphs.py
# Umbrella target; note it uses the native dbgen TPCH path (data-tpch), not
# data-tpch-duckdb, and does not include data-graphs-real.
data-all: data-tpch data-snb data-synthetic data-stats ## Run full data preparation pipeline
	@echo "All data preparation complete."
# Validates every sample DSL query file against the schema and exits non-zero
# if any file is invalid.  The previous inline script initialized `ok = True`
# but its failure path did setattr(sys.modules[__name__], '_ok', False) —
# a different name — so `ok` never changed, the summary always claimed
# success, and the target always exited 0.  This version collects results,
# derives `ok` with all(), and propagates it through sys.exit.
validate-queries: ## Validate all sample DSL queries against schema
	$(PYTHON) -c "\
	import sys; \
	sys.path.insert(0, '.'); \
	from dsl.validator import validate_query_file; \
	files = ['dsl/sample_queries/tpch_queries.json', 'dsl/sample_queries/snb_queries.json', 'dsl/sample_queries/synthetic_queries.json']; \
	results = [(f, validate_query_file(f)) for f in files]; \
	[print(f'Validating {f}...') or print('  ✅ Valid' if r['valid'] else f'  ❌ Errors: {r[\"errors\"]}') for f, r in results]; \
	ok = all(r['valid'] for f, r in results); \
	print('All queries validated.' if ok else 'Some queries have errors.'); \
	sys.exit(0 if ok else 1)"
# --- Phase 3+ (placeholders) ---
# Runs the whole pytest suite verbosely; fails the target on any test failure.
test: ## Run all tests
	$(PYTHON) -m pytest tests/ -v
# --- Phase 4: ML Pipeline ---
collect-data: ## Generate labeled training data from DSL queries
	$(PYTHON) -m training_data.collection_script
# Depends on data-graphs-real so JOB/TPCDS graph projections exist first.
collect-coverage-boost: data-graphs-real ## Targeted real collection to improve per-dataset GRAPH coverage
	$(PYTHON) -m training_data.real_query_generator --scale aggressive --focus-mode graph_win --include-sql-families-in-graph-focus
	$(PYTHON) -m training_data.real_collection_script --queries_dir dsl/sample_queries --output training_data/real_labeled_runs_coverage_boost.csv --n_warmup 1 --n_measure 2 --repeat 1 --no_penalize_sql_traversal_approx
# Three-step preview pipeline: merge+dedup boosted labels into strict-all,
# rebuild grouped splits, then run the coverage quality gate on the preview
# artifacts (outputs all carry the *_preview suffix; strict-all is untouched).
collect-coverage-boost-iter: collect-coverage-boost ## Merge boosted labels into strict-all and run coverage gate preview
	$(PYTHON) -m training_data.merge_and_dedup_labels --base training_data/real_labeled_runs_strict_all.csv --extra training_data/real_labeled_runs_coverage_boost.csv --output training_data/real_labeled_runs_strict_all_plus_boost_preview.csv --summary_out training_data/real_labeled_runs_strict_all_plus_boost_preview_summary.json
	$(PYTHON) -m training_data.fix_dataset_splits --source training_data/real_labeled_runs_strict_all_plus_boost_preview.csv --split_mode group --group_col query_id --train_base_out training_data/fixed_train_base_strict_all_plus_boost_preview.csv --eval_out training_data/fixed_eval_set_strict_all_plus_boost_preview.csv --graph_eval_out training_data/fixed_eval_graph_only_strict_all_plus_boost_preview.csv --train_balanced_out training_data/fixed_train_balanced_strict_all_plus_boost_preview.csv --manifest_out training_data/fixed_split_manifest_strict_all_plus_boost_preview.json
	$(PYTHON) -m training_data.dataset_quality_gate --source training_data/real_labeled_runs_strict_all_plus_boost_preview.csv --train training_data/fixed_train_base_strict_all_plus_boost_preview.csv --eval training_data/fixed_eval_set_strict_all_plus_boost_preview.csv --per_dataset_min_graph "snb_real_queries:25,ogb_real_queries:25,snb_bi_real_queries:25,job_real_queries:25,tpcds_real_queries:25" --require_dataset_presence --out_json training_data/dataset_quality_report_strict_all_plus_boost_preview_coverage.json
train: collect-data ## Train ML classifier (Decision Tree + XGBoost)
	$(PYTHON) -m model.trainer
analyze: train ## Run SHAP analysis and feature importance
	$(PYTHON) -m model.feature_importance
report: ## Open project report notebook
	jupyter notebook notebooks/project_report.ipynb
# --- Phase 5: Execution Engine Integration ---
test-execution: ## Run execution engine unit tests
	$(PYTHON) -m pytest tests/test_execution.py -v
test-correctness: ## Run correctness tests (HybridRouter vs ReferenceExecutor)
	$(PYTHON) -m pytest tests/test_correctness.py -v
# Executes one query file through the router and prints per-subquery routing
# decisions plus timing and result shape.  Usage: make run-query QUERY=path.json
# Fix: the backslash continuations join the whole -c script onto ONE logical
# line, and Python forbids a compound `with` statement after `;` in a
# simple-statement list — the original was a guaranteed SyntaxError.
# `json.load(open(...))` keeps every statement simple.
run-query: ## Run a single HIFUN query file via HybridRouter (QUERY=path/to/query.json)
	$(PYTHON) -c "\
	import json, sys; \
	sys.path.insert(0, '.'); \
	from router.hybrid_router import HybridRouter; \
	router = HybridRouter(); \
	queries = json.load(open('$(QUERY)')); \
	q = queries[0] if isinstance(queries, list) else queries; \
	result = router.execute_query(q); \
	print('Routing decisions:'); \
	[print(f'  {d[\"sub_id\"]}: {d[\"engine\"]} (conf={d[\"confidence\"]:.3f})') for d in result['routing_decisions']]; \
	print(f'Total time: {result[\"total_time_ms\"]:.1f}ms'); \
	print(f'Result shape: {result[\"result\"].shape}'); \
	print(result['result'].head())"
# Routes every TPC-H sample query and prints engines/time/row-count per query.
# Fix: after shell line-joining, the original's `...; for q in queries: ...`
# put a compound `for` after `;` on one line — a Python SyntaxError.  Same for
# its `with open(...)`.  Rewritten with json.load(open(...)) and a list
# comprehension (the inner `for r in [...]` binds the per-query result).
run-tpch: ## Run all TPC-H queries via HybridRouter
	$(PYTHON) -c "\
	import json, sys; \
	sys.path.insert(0, '.'); \
	from router.hybrid_router import HybridRouter; \
	router = HybridRouter(); \
	queries = json.load(open('dsl/sample_queries/tpch_queries.json')); \
	[print(f\"{q['query_id']}: engines={[d['engine'] for d in r['routing_decisions']]}, time={r['total_time_ms']:.1f}ms, rows={len(r['result'])}\") for q in queries for r in [router.execute_query(q)]]"
# Routes every synthetic-graph sample query; same output format as run-tpch.
# Fix: same SyntaxError as run-tpch — compound `with`/`for` statements after
# `;` on the single joined -c line.  Rewritten with json.load(open(...)) and
# a list comprehension.
run-synthetic: ## Run all synthetic graph queries via HybridRouter
	$(PYTHON) -c "\
	import json, sys; \
	sys.path.insert(0, '.'); \
	from router.hybrid_router import HybridRouter; \
	router = HybridRouter(); \
	queries = json.load(open('dsl/sample_queries/synthetic_queries.json')); \
	[print(f\"{q['query_id']}: engines={[d['engine'] for d in r['routing_decisions']]}, time={r['total_time_ms']:.1f}ms, rows={len(r['result'])}\") for q in queries for r in [router.execute_query(q)]]"
# End-to-end convenience target: data prep, label collection, training, SHAP.
pipeline: data-all collect-data train analyze ## Run full pipeline: data → train → analyze
	@echo "Full pipeline complete."
# Aggregates the phase 1-6 test targets; any pytest failure aborts the chain.
test-all: test test-execution test-correctness test-experiments ## Run ALL tests (phase 1-6)
	@echo "All tests passed."
# --- Phase 6: Evaluation & Experiments ---
test-experiments: ## Run Phase 6 experiment tests
	$(PYTHON) -m pytest tests/test_experiments.py -v
# Three fixed routing baselines, each a strategy flag to the same module.
baseline-sql: ## Run Always-SQL baseline
	$(PYTHON) -m experiments.run_baselines --strategy always_sql
baseline-graph: ## Run Always-Graph baseline
	$(PYTHON) -m experiments.run_baselines --strategy always_graph
baseline-rule: ## Run Rule-Based baseline
	$(PYTHON) -m experiments.run_baselines --strategy rule_based
baselines: baseline-sql baseline-graph baseline-rule ## Run all baselines
	@echo "All baselines complete."
learned: ## Run ML-routed (learned) strategy
	$(PYTHON) -m experiments.run_learned
compare: ## Compare all strategies and produce comparison table
	$(PYTHON) -m experiments.compare_results
ablation: ## Run feature ablation study (repeated CV stability)
	$(PYTHON) -m experiments.ablation_study
evaluate: baselines learned compare ablation ## Run full Phase 6 evaluation pipeline
	@echo "Full evaluation pipeline complete."
# Default gate over the base (non-strict) labeled runs and their fixed splits.
quality-gate: ## Block invalid/degenerate datasets before reporting results
	$(PYTHON) -m training_data.dataset_quality_gate \
		--source training_data/real_labeled_runs.csv \
		--train training_data/fixed_train_base.csv \
		--eval training_data/fixed_eval_set.csv
# Same gate against the strict curated artifacts, with a JSON report out.
quality-gate-strict: ## Strict quality gate on curated real-measurement artifacts
	$(PYTHON) -m training_data.dataset_quality_gate \
		--source training_data/real_labeled_runs_strict_curated.csv \
		--train training_data/fixed_train_base_strict.csv \
		--eval training_data/fixed_eval_set_strict.csv \
		--out_json training_data/dataset_quality_report_strict_runtime.json
build-all-validated-data: ## Build strict all-data source by merging strict curated + other real-measured datasets
	$(PYTHON) -m training_data.build_validated_all_source \
		--strict_source training_data/real_labeled_runs_strict_curated.csv \
		--full_source training_data/real_labeled_runs.csv \
		--output training_data/real_labeled_runs_strict_all.csv \
		--summary_out training_data/real_labeled_runs_strict_all_summary.json
# Four-step strict-all bundle: grouped splits -> quality gate -> trainer ->
# relevance evaluation, all over the *_strict_all artifacts.
publish-eval-all: build-all-validated-data ## Run strict eval bundle on all validated real datasets
	$(PYTHON) -m training_data.fix_dataset_splits --source training_data/real_labeled_runs_strict_all.csv --split_mode group --group_col query_id --train_base_out training_data/fixed_train_base_strict_all.csv --eval_out training_data/fixed_eval_set_strict_all.csv --graph_eval_out training_data/fixed_eval_graph_only_strict_all.csv --train_balanced_out training_data/fixed_train_balanced_strict_all.csv --manifest_out training_data/fixed_split_manifest_strict_all.json
	$(PYTHON) -m training_data.dataset_quality_gate --source training_data/real_labeled_runs_strict_all.csv --train training_data/fixed_train_base_strict_all.csv --eval training_data/fixed_eval_set_strict_all.csv --out_json training_data/dataset_quality_report_strict_all_runtime.json
	$(PYTHON) -m model.trainer --data training_data/fixed_train_base_strict_all.csv
	$(PYTHON) -m experiments.relevance_evaluation --train training_data/fixed_train_base_strict_all.csv --eval training_data/fixed_eval_set_strict_all.csv --out_json experiments/results/relevance_eval_strict_all_runtime.json --out_md experiments/results/relevance_eval_strict_all_runtime.md
# Gate variant requiring >=25 GRAPH-labeled rows per named dataset and the
# presence of every listed dataset.
quality-gate-coverage-all: ## Enforce per-dataset GRAPH minima for broad multi-dataset claims
	$(PYTHON) -m training_data.dataset_quality_gate \
		--source training_data/real_labeled_runs_strict_all.csv \
		--train training_data/fixed_train_base_strict_all.csv \
		--eval training_data/fixed_eval_set_strict_all.csv \
		--per_dataset_min_graph "snb_real_queries:25,ogb_real_queries:25,snb_bi_real_queries:25,job_real_queries:25,tpcds_real_queries:25" \
		--require_dataset_presence \
		--out_json training_data/dataset_quality_report_strict_all_coverage.json
# Uses $(MAKE) (not bare make) so flags/jobserver propagate to the sub-make.
publish-eval-all-coverage: publish-eval-all ## All-data eval plus strict per-dataset coverage and improved ablation protocol
	$(PYTHON) -m experiments.ablation_study --data training_data/fixed_train_base_strict_all.csv --output experiments/results/ablation_strict_all_runtime.csv --model xgboost --group-col query_id --by-dataset
	$(MAKE) quality-gate-coverage-all
# Publishable strict bundle over the curated artifacts: splits -> gate ->
# trainer -> relevance, ablation, dataset-shift, robustness, correctness
# report -> final publish gate.  Any failing step aborts the chain.
publish-eval: ## Run publishable strict evaluation bundle
	$(PYTHON) -m training_data.fix_dataset_splits --source training_data/real_labeled_runs_strict_curated.csv --split_mode group --group_col query_id --train_base_out training_data/fixed_train_base_strict.csv --eval_out training_data/fixed_eval_set_strict.csv --graph_eval_out training_data/fixed_eval_graph_only_strict.csv --train_balanced_out training_data/fixed_train_balanced_strict.csv --manifest_out training_data/fixed_split_manifest_strict.json
	$(PYTHON) -m training_data.dataset_quality_gate --source training_data/real_labeled_runs_strict_curated.csv --train training_data/fixed_train_base_strict.csv --eval training_data/fixed_eval_set_strict.csv --out_json training_data/dataset_quality_report_strict_runtime.json
	$(PYTHON) -m model.trainer --data training_data/fixed_train_base_strict.csv
	$(PYTHON) -m experiments.relevance_evaluation --train training_data/fixed_train_base_strict.csv --eval training_data/fixed_eval_set_strict.csv --out_json experiments/results/relevance_eval_strict_runtime.json --out_md experiments/results/relevance_eval_strict_runtime.md
	$(PYTHON) -m experiments.ablation_study --data training_data/fixed_train_base_strict.csv --output experiments/results/ablation_strict_runtime.csv
	$(PYTHON) -m experiments.dataset_shift_evaluation --source training_data/real_labeled_runs_strict_curated.csv --out_json experiments/results/dataset_shift_eval_strict_runtime.json --out_md experiments/results/dataset_shift_eval_strict_runtime.md
	$(PYTHON) -m experiments.strict_robustness_evaluation --train training_data/fixed_train_base_strict.csv --eval training_data/fixed_eval_set_strict.csv --transfer_source training_data/real_labeled_runs_strict_curated.csv --out_json experiments/results/strict_robustness_eval_runtime.json --out_md experiments/results/strict_robustness_eval_runtime.md
	$(PYTHON) -m experiments.correctness_report --queries dsl/sample_queries --output experiments/results/correctness_report_runtime.csv
	$(PYTHON) -m experiments.publish_gate
	@echo "Publishable strict evaluation complete."
# Pure alias: prerequisite only, intentionally no recipe.
publish-eval-strict: publish-eval ## Alias for strict publishable bundle
publish-gate: ## Validate strict publish artifacts and thresholds
	$(PYTHON) -m experiments.publish_gate --min_max_feature_drop 0.005 --min_max_group_drop 0.005 --min_max_permutation_drop 0.05
publish-gate-native: ## Validate strict publish artifacts and require native TPCH parquet
	$(PYTHON) -m experiments.publish_gate --require_native_tpch --min_max_feature_drop 0.005 --min_max_group_drop 0.005 --min_max_permutation_drop 0.05
status-snapshot: ## Generate one-page strict runtime status checklist
	$(PYTHON) -m experiments.status_snapshot --output experiments/results/project_status_snapshot.md
correctness-native: ## Run correctness report requiring native TPCH parquet
	$(PYTHON) -m experiments.correctness_report --queries dsl/sample_queries --output experiments/results/correctness_report_native_runtime.csv --require_native_tpch
# --- Cleanup ---
# Removes generated artifacts only — never raw inputs.  Directories take
# `rm -rf`; plain files and file globs take `rm -f` (the original used the
# recursive flag on single files and *.ext globs, where it is never needed
# and only widens the blast radius of a typo).
clean: ## Remove generated data (keeps raw data)
	rm -rf data/parquet/tpch data/parquet/snb
	rm -f data/graphs/*.parquet
	rm -f data/stats/*.json
	rm -f model/artifacts/*.pkl model/artifacts/*.json
	rm -rf model/artifacts/analysis/
	rm -f training_data/labeled_runs.csv
# Deletes iterative/debug result files; strict headline outputs are untouched.
clean-local-results: ## Remove local iterative result artifacts (keeps strict headline outputs)
	rm -f experiments/results/*_default_check.csv
	rm -f experiments/results/*_default_check.json
	rm -f experiments/results/*_default_check.md
	rm -f experiments/results/*_default_check_groups.csv
	rm -f experiments/results/*_strict_recheck.json
	rm -f experiments/results/*_strict_recheck.md
	rm -f experiments/results/correctness_report_runtime.csv
	rm -f experiments/results/ablation_debug_groups.csv