Skip to content

Commit d0237fb

Browse files
rizaonImpala Public Jenkins
authored andcommitted
IMPALA-8042: Assign BETWEEN selectivity for discrete-unique column
Impala frontend can not evaluate BETWEEN/NOT BETWEEN predicate directly. It needs to transform a BetweenPredicate into a CompoundPredicate consisting of upper bound and lower bound BinaryPredicate through BetweenToCompoundRule.java. The BinaryPredicate can then be pushed down or rewritten into other form by another expression rewrite rule. However, the selectivity of BetweenPredicate or its derivatives remains unassigned and often collapses with other unknown selectivity predicates to have collective selectivity equals Expr.DEFAULT_SELECTIVITY (0.1). This patch adds a narrow optimization of BetweenPredicate selectivity when the following criteria are met: 1. The BetweenPredicate is bound to a slot reference of a single column of a table. 2. The column type is discrete, such as INTEGER or DATE. 3. The column stats are available. 4. The column is sufficiently unique based on available stats. 5. The BETWEEN/NOT BETWEEN predicate is in good form (lower bound value <= upper bound value). 6. The final calculated selectivity is less than or equal to Expr.DEFAULT_SELECTIVITY. If these criteria are unmet, the Planner will revert to the old behavior, which is letting the selectivity unassigned. Since this patch only target BetweenPredicate over unique column, the following query will still have the default scan selectivity (0.1): select count(*) from tpch.customer c where c.c_custkey >= 1234 and c.c_custkey <= 2345; While this equivalent query written with BETWEEN predicate will have lower scan selectivity: select count(*) from tpch.customer c where c.c_custkey between 1234 and 2345; This patch calculates the BetweenPredicate selectivity during transformation at BetweenToCompoundRule.java. The selectivity is piggy-backed into the resulting CompoundPredicate and BinaryPredicate as betweenSelectivity_ field, separate from the selectivity_ field. Analyzer.getBoundPredicates() is modified to prioritize the derived BinaryPredicate over ordinary BinaryPredicate in its return value to prevent the derived BinaryPredicate from being eliminated by a matching ordinary BinaryPredicate. Testing: - Add table functional_parquet.unique_with_nulls. - Add FE tests in ExprCardinalityTest#testBetweenSelectivity, ExprCardinalityTest#testNotBetweenSelectivity, and PlannerTest#testScanCardinality. - Pass core tests. Change-Id: Ib349d97349d1ee99788645a66be1b81749684d10 Reviewed-on: http://gerrit.cloudera.org:8080/21377 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
1 parent 8a6f282 commit d0237fb

27 files changed

Lines changed: 3726 additions & 3320 deletions

fe/src/main/java/org/apache/impala/analysis/Analyzer.java

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2460,6 +2460,8 @@ public boolean canEvalAntiJoinedConjunct(Expr e, List<TupleId> nodeTupleIds) {
24602460
*/
24612461
public List<Expr> getBoundPredicates(TupleId destTid, Set<SlotId> ignoreSlots,
24622462
boolean markAssigned) {
2463+
// Map that tracks BinaryPredicates that derived from the same BetweenPredicate.
2464+
Map<ExprId, List<BinaryPredicate>> betweenPredicates = new HashMap<>();
24632465
List<Expr> result = new ArrayList<>();
24642466
for (ExprId srcConjunctId: globalState_.singleTidConjuncts) {
24652467
Expr srcConjunct = globalState_.conjuncts.get(srcConjunctId);
@@ -2610,9 +2612,31 @@ public List<Expr> getBoundPredicates(TupleId destTid, Set<SlotId> ignoreSlots,
26102612
}
26112613
}
26122614

2613-
// check if we already created this predicate
2614-
if (!result.contains(p)) result.add(p);
2615+
if ((p instanceof BinaryPredicate)
2616+
&& ((BinaryPredicate) p).derivedFromBetween()) {
2617+
BinaryPredicate b = (BinaryPredicate) p;
2618+
betweenPredicates.computeIfAbsent(b.getBetweenExprId(), k -> new ArrayList<>());
2619+
betweenPredicates.get(b.getBetweenExprId()).add(b);
2620+
} else {
2621+
// check if we already created this predicate
2622+
if (!result.contains(p)) result.add(p);
2623+
}
2624+
}
2625+
}
2626+
2627+
if (!betweenPredicates.isEmpty()) {
2628+
// Prioritize members of 'betweenPredicates' ahead of 'result'.
2629+
// BinaryPredicates that derived from BetweenPredicates may have lower selectivity
2630+
// estimate from BetweenToCompoundRule. Placing them in-front will ensure that
2631+
// they are retained over other matching BinaryPredicates that do not come from
2632+
// BetweenPredicates when passed through Expr.removeDuplicates().
2633+
List<Expr> prioritizedExprs = new ArrayList<>();
2634+
for (List<BinaryPredicate> predicates : betweenPredicates.values()) {
2635+
prioritizedExprs.addAll(predicates);
26152636
}
2637+
prioritizedExprs.addAll(result);
2638+
Expr.removeDuplicates(prioritizedExprs);
2639+
result = prioritizedExprs;
26162640
}
26172641
return result;
26182642
}

fe/src/main/java/org/apache/impala/analysis/BinaryPredicate.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ public class BinaryPredicate extends Predicate {
4646
// true if this BinaryPredicate is inferred from slot equivalences, false otherwise.
4747
private boolean isInferred_ = false;
4848

49+
private ExprId betweenExprId_ = null;
50+
private double betweenSelectivity_ = -1;
51+
4952
public enum Operator {
5053
EQ("=", "eq", TComparisonOp.EQ),
5154
NE("!=", "ne", TComparisonOp.NE),
@@ -155,6 +158,8 @@ protected BinaryPredicate(BinaryPredicate other) {
155158
super(other);
156159
op_ = other.op_;
157160
isInferred_ = other.isInferred_;
161+
betweenExprId_ = other.betweenExprId_;
162+
betweenSelectivity_ = other.betweenSelectivity_;
158163
}
159164

160165
public boolean isNullMatchingEq() { return op_ == Operator.NULL_MATCHING_EQ; }
@@ -193,6 +198,10 @@ public String debugString() {
193198
toStrHelper.add("op", op_).addValue(super.debugString());
194199
if (isAuxExpr()) toStrHelper.add("isAux", true);
195200
if (isInferred_) toStrHelper.add("isInferred", true);
201+
if (derivedFromBetween()) {
202+
toStrHelper.add("betweenExprId", betweenExprId_);
203+
toStrHelper.add("betweenSelectivity", betweenSelectivity_);
204+
}
196205
return toStrHelper.toString();
197206
}
198207

@@ -415,4 +424,13 @@ public boolean localEquals(Expr that) {
415424

416425
@Override
417426
public Expr clone() { return new BinaryPredicate(this); }
427+
428+
public void setBetweenSelectivity(ExprId betweenExprId, double betweenSelectivity) {
429+
betweenExprId_ = betweenExprId;
430+
betweenSelectivity_ = betweenSelectivity;
431+
}
432+
433+
public boolean derivedFromBetween() { return betweenExprId_ != null; }
434+
public ExprId getBetweenExprId() { return betweenExprId_; }
435+
public double getBetweenSelectivity() { return betweenSelectivity_; }
418436
}

fe/src/main/java/org/apache/impala/analysis/CompoundPredicate.java

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import org.apache.impala.thrift.TExprNodeType;
2929

3030
import com.google.common.base.MoreObjects;
31+
import com.google.common.base.MoreObjects.ToStringHelper;
3132
import com.google.common.base.Preconditions;
3233
import com.google.common.collect.Lists;
3334

@@ -54,6 +55,11 @@ public String toString() {
5455
}
5556
private final Operator op_;
5657

58+
// Selectivity estimate from BetweenToCompoundRule.computeBetweenSelectivity().
59+
// Only set if this CompoundPredicate is a rewrite from BetweenPredicate.
60+
// Otherwise, set to -1.
61+
private final double betweenSelectivity_;
62+
5763
public static void initBuiltins(Db db) {
5864
// AND and OR are implemented as custom exprs, so they do not have a function symbol.
5965
db.addBuiltin(ScalarFunction.createBuiltinOperator(
@@ -67,7 +73,13 @@ public static void initBuiltins(Db db) {
6773
Lists.<Type>newArrayList(Type.BOOLEAN), Type.BOOLEAN));
6874
}
6975

70-
public CompoundPredicate(Operator op, Expr e1, Expr e2) {
76+
public static CompoundPredicate createFromBetweenPredicate(Operator op,
77+
BinaryPredicate lower, BinaryPredicate upper, double betweenSelectivity) {
78+
Preconditions.checkArgument(betweenSelectivity >= 0.0);
79+
return new CompoundPredicate(op, lower, upper, betweenSelectivity);
80+
}
81+
82+
private CompoundPredicate(Operator op, Expr e1, Expr e2, double betweenSelectivity) {
7183
super();
7284
this.op_ = op;
7385
Preconditions.checkNotNull(e1);
@@ -84,14 +96,18 @@ public CompoundPredicate(Operator op, Expr e1, Expr e2) {
8496
setHasAlwaysTrueHint(true);
8597
}
8698
}
99+
betweenSelectivity_ = betweenSelectivity;
87100
}
88101

102+
public CompoundPredicate(Operator op, Expr e1, Expr e2) { this(op, e1, e2, -1); }
103+
89104
/**
90105
* Copy c'tor used in clone().
91106
*/
92107
protected CompoundPredicate(CompoundPredicate other) {
93108
super(other);
94109
op_ = other.op_;
110+
betweenSelectivity_ = other.betweenSelectivity_;
95111
}
96112

97113
public Operator getOp() { return op_; }
@@ -103,10 +119,11 @@ public boolean localEquals(Expr that) {
103119

104120
@Override
105121
public String debugString() {
106-
return MoreObjects.toStringHelper(this)
122+
ToStringHelper helper = MoreObjects.toStringHelper(this)
107123
.add("op", op_)
108-
.addValue(super.debugString())
109-
.toString();
124+
.addValue(super.debugString());
125+
if (betweenSelectivity_ != -1) helper.add("betweenSelectivity", betweenSelectivity_);
126+
return helper.toString();
110127
}
111128

112129
@Override
@@ -163,8 +180,11 @@ protected void computeSelectivity(Analyzer analyzer) {
163180

164181
@Deprecated
165182
protected void computeSelectivity() {
166-
if (!getChild(0).hasSelectivity() ||
167-
(children_.size() == 2 && !getChild(1).hasSelectivity())) {
183+
if (betweenSelectivity_ != -1) {
184+
selectivity_ = betweenSelectivity_;
185+
return;
186+
} else if (!getChild(0).hasSelectivity()
187+
|| (children_.size() == 2 && !getChild(1).hasSelectivity())) {
168188
// Give up if one of our children has an unknown selectivity.
169189
selectivity_ = -1;
170190
return;
@@ -238,4 +258,6 @@ public static Expr createConjunction(Expr lhs, Expr rhs) {
238258
if (rhs == null) return lhs;
239259
return new CompoundPredicate(Operator.AND, rhs, lhs);
240260
}
261+
262+
public boolean derivedFromBetween() { return betweenSelectivity_ != -1; }
241263
}

fe/src/main/java/org/apache/impala/analysis/Expr.java

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1851,9 +1851,11 @@ public static String getExplainString(
18511851
* Analyzes and evaluates expression to an integral value, returned as a long.
18521852
* Throws if the expression cannot be evaluated or if the value evaluates to null.
18531853
* The 'name' parameter is used in exception messages, e.g. "LIMIT expression
1854-
* evaluates to NULL".
1854+
* evaluates to NULL". If 'acceptDate' is true, treat Date expression as integral
1855+
* expression as well.
18551856
*/
1856-
public long evalToInteger(Analyzer analyzer, String name) throws AnalysisException {
1857+
public long evalToInteger(Analyzer analyzer, String name, boolean acceptDate)
1858+
throws AnalysisException {
18571859
// Check for slotrefs and subqueries before analysis so we can provide a more
18581860
// helpful error message.
18591861
if (contains(SlotRef.class) || contains(Subquery.class)) {
@@ -1865,7 +1867,7 @@ public long evalToInteger(Analyzer analyzer, String name) throws AnalysisExcepti
18651867
throw new AnalysisException(name + " expression must be a constant expression: " +
18661868
toSql());
18671869
}
1868-
if (!getType().isIntegerType()) {
1870+
if (!getType().isIntegerType() && !(acceptDate && getType().isDate())) {
18691871
throw new AnalysisException(name + " expression must be an integer type but is '" +
18701872
getType() + "': " + toSql());
18711873
}
@@ -1875,6 +1877,16 @@ public long evalToInteger(Analyzer analyzer, String name) throws AnalysisExcepti
18751877
} catch (InternalException e) {
18761878
throw new AnalysisException("Failed to evaluate expr: " + toSql(), e);
18771879
}
1880+
1881+
try {
1882+
return evalToInteger(val, acceptDate);
1883+
} catch (AnalysisException e) {
1884+
throw new AnalysisException(name + " expression evaluates to NULL: " + toSql());
1885+
}
1886+
}
1887+
1888+
public static long evalToInteger(TColumnValue val, boolean acceptDate)
1889+
throws AnalysisException {
18781890
long value;
18791891
if (val.isSetLong_val()) {
18801892
value = val.getLong_val();
@@ -1884,12 +1896,18 @@ public long evalToInteger(Analyzer analyzer, String name) throws AnalysisExcepti
18841896
value = val.getShort_val();
18851897
} else if (val.isSetByte_val()) {
18861898
value = val.getByte_val();
1899+
} else if (acceptDate && val.isSetDate_val()) {
1900+
value = val.getDate_val();
18871901
} else {
1888-
throw new AnalysisException(name + " expression evaluates to NULL: " + toSql());
1902+
throw new AnalysisException("TColumnValue evaluates to NULL: " + val);
18891903
}
18901904
return value;
18911905
}
18921906

1907+
public long evalToInteger(Analyzer analyzer, String name) throws AnalysisException {
1908+
return evalToInteger(analyzer, name, false);
1909+
}
1910+
18931911
/**
18941912
* Analyzes and evaluates expression to a non-negative integral value, returned as a
18951913
* long. Throws if the expression cannot be evaluated, if the value evaluates to null,

fe/src/main/java/org/apache/impala/catalog/Type.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ public boolean isDateOrTimeType() {
234234
|| isScalarType(PrimitiveType.TIMESTAMP);
235235
}
236236

237+
public boolean isIntegerOrDateType() { return isIntegerType() || isDate(); }
237238
public boolean isComplexType() { return isStructType() || isCollectionType(); }
238239
public boolean isCollectionType() { return isMapType() || isArrayType(); }
239240
public boolean isMapType() { return this instanceof MapType; }

fe/src/main/java/org/apache/impala/planner/PlanNode.java

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,22 @@
2121
import java.util.Arrays;
2222
import java.util.Collection;
2323
import java.util.Collections;
24-
import java.util.Iterator;
24+
import java.util.HashMap;
2525
import java.util.HashSet;
26+
import java.util.Iterator;
2627
import java.util.List;
28+
import java.util.Map;
2729
import java.util.Set;
2830
import java.util.Stack;
2931

3032
import org.apache.impala.analysis.Analyzer;
3133
import org.apache.impala.analysis.BinaryPredicate;
34+
import org.apache.impala.analysis.CompoundPredicate;
3235
import org.apache.impala.analysis.Expr;
3336
import org.apache.impala.analysis.ExprId;
3437
import org.apache.impala.analysis.ExprSubstitutionMap;
3538
import org.apache.impala.analysis.SlotDescriptor;
39+
import org.apache.impala.analysis.SlotId;
3640
import org.apache.impala.analysis.SlotRef;
3741
import org.apache.impala.analysis.ToSqlOptions;
3842
import org.apache.impala.analysis.TupleDescriptor;
@@ -751,19 +755,59 @@ protected void computeMemLayout(Analyzer analyzer) {
751755
* 1. The individual selectivities of conjuncts may be unknown.
752756
* 2. Two selectivities, whether known or unknown, could be correlated. Assuming
753757
* independence can lead to significant underestimation.
758+
* 3. Two BinaryPredicate may be derived from single BetweenPredicate that have
759+
* lower selectivity if analyzed as a pair.
754760
*
755761
* The first issue is addressed by using a single default selectivity that is
756762
* representative of all conjuncts with unknown selectivities.
757763
* The second issue is addressed by an exponential backoff when multiplying each
758764
* additional selectivity into the final result.
765+
* The third issue is addressed by checking BinaryPredicate.derivedFromBetween()
766+
* property. If it is True, calculate the expression selectivity from
767+
* BinaryPredicate.getBetweenSelectivity(). Both of the BinaryPredicates should have
768+
* the same getBetweenSelectivity() value, thus it is OK to inspect just from one of
769+
* them.
770+
* TODO: Fix the third issue with more general solution when there are multiple range
771+
* predicates involved, even if some of them are not derived from BetweenPredicate.
759772
*/
760773
static protected double computeCombinedSelectivity(List<Expr> conjuncts) {
761774
// Collect all estimated selectivities.
762775
List<Double> selectivities = new ArrayList<>();
776+
// Map between a slot id and the lowest between selectivity targeting that slot.
777+
Map<SlotId, Double> perSlotBetweenSelectivities = new HashMap<>();
778+
779+
int conjunctSize = 0;
763780
for (Expr e: conjuncts) {
764-
if (e.hasSelectivity()) selectivities.add(e.getSelectivity());
781+
if (e instanceof BinaryPredicate && ((BinaryPredicate) e).derivedFromBetween()) {
782+
// This is one of two BinaryPredicate that derived from a BetweenPredicate.
783+
// A pair of BetweenPredicate must have been assigned and pushed down to the same
784+
// 'targetSlotId', or one of them maybe removed by Expr.removeDuplicates()
785+
// somewhere by Analyzer. But it is enough to get the selectivity from one of
786+
// them. Analyzer.getBoundPredicates() ensure that BinaryPredicate derived from
787+
// BetweenPredicate gets priority over regular BinaryPredicate.
788+
// If multiple BetweenPredicate target the same 'targetSlotId', pick the least
789+
// selectivity.
790+
BinaryPredicate pred = (BinaryPredicate) e;
791+
insertBetweenSelectivity(pred, perSlotBetweenSelectivities);
792+
} else if (e instanceof CompoundPredicate
793+
&& ((CompoundPredicate) e).derivedFromBetween()
794+
&& (((CompoundPredicate) e).getChild(0) instanceof BinaryPredicate)) {
795+
BinaryPredicate pred = (BinaryPredicate) ((CompoundPredicate) e).getChild(0);
796+
insertBetweenSelectivity(pred, perSlotBetweenSelectivities);
797+
} else {
798+
// For everything else.
799+
if (e.hasSelectivity()) selectivities.add(e.getSelectivity());
800+
conjunctSize++;
801+
}
765802
}
766-
if (selectivities.size() != conjuncts.size()) {
803+
804+
// Add all values from perSlotBetweenSelectivities to selectivities.
805+
if (!perSlotBetweenSelectivities.isEmpty()) {
806+
selectivities.addAll(perSlotBetweenSelectivities.values());
807+
conjunctSize += perSlotBetweenSelectivities.size();
808+
}
809+
810+
if (selectivities.size() != conjunctSize) {
767811
// Some conjuncts have no estimated selectivity. Use a single default
768812
// representative selectivity for all those conjuncts.
769813
selectivities.add(Expr.DEFAULT_SELECTIVITY);
@@ -781,6 +825,15 @@ static protected double computeCombinedSelectivity(List<Expr> conjuncts) {
781825
return Math.max(0.0, Math.min(1.0, result));
782826
}
783827

828+
static private void insertBetweenSelectivity(
829+
BinaryPredicate pred, Map<SlotId, Double> perSlotBetweenSelectivities) {
830+
Preconditions.checkNotNull(pred.getBoundSlot());
831+
Preconditions.checkState(pred.getBetweenSelectivity() >= 0.0);
832+
SlotId targetSlotId = pred.getBoundSlot().getSlotId();
833+
double sel = pred.getBetweenSelectivity();
834+
perSlotBetweenSelectivities.merge(targetSlotId, sel, (v1, v2) -> Math.min(v1, v2));
835+
}
836+
784837
protected double computeSelectivity() {
785838
return computeCombinedSelectivity(conjuncts_);
786839
}

0 commit comments

Comments
 (0)