検索エンジン OSS 勉強会 第二回の発表資料です。発表者: gteu
ef_search
パラメータを指定できないことで、Latency や Recall に影響が出る可能性があるKNNWeight.java
の scorer
にある @Override
public Scorer scorer(LeafReaderContext context) throws IOException {
final BitSet filterBitSet = getFilteredDocsBitSet(context);
int cardinality = filterBitSet.cardinality();
// We don't need to go to JNI layer if no documents are found which satisfy the filters
// We should give this condition a deeper look that where it should be placed. For now I feel this is a good
// place,
if (filterWeight != null && cardinality == 0) {
return KNNScorer.emptyScorer(this);
}
final Map<Integer, Float> docIdsToScoreMap = new HashMap<>();
/*
* The idea for this optimization is to get K results, we need to atleast look at K vectors in the HNSW graph
* . Hence, if filtered results are less than K and filter query is present we should shift to exact search.
* This improves the recall.
*/
if (filterWeight != null && canDoExactSearch(cardinality)) {
docIdsToScoreMap.putAll(doExactSearch(context, filterBitSet, cardinality));
} else {
Map<Integer, Float> annResults = doANNSearch(context, filterBitSet, cardinality);
if (annResults == null) {
return null;
}
if (canDoExactSearchAfterANNSearch(cardinality, annResults.size())) {
log.debug(
"Doing ExactSearch after doing ANNSearch as the number of documents returned are less than "
+ "K, even when we have more than K filtered Ids. K: {}, ANNResults: {}, filteredIdCount: {}",
knnQuery.getK(),
annResults.size(),
cardinality
);
annResults = doExactSearch(context, filterBitSet, cardinality);
}
docIdsToScoreMap.putAll(annResults);
}
if (docIdsToScoreMap.isEmpty()) {
return KNNScorer.emptyScorer(this);
}
return convertSearchResponseToScorer(docIdsToScoreMap);
}
private boolean canDoExactSearch(final int filterIdsCount) {
log.debug(
"Info for doing exact search filterIdsLength : {}, Threshold value: {}",
filterIdsCount,
KNNSettings.getFilteredExactSearchThreshold(knnQuery.getIndexName())
);
if (knnQuery.getRadius() != null) {
return false;
}
int filterThresholdValue = KNNSettings.getFilteredExactSearchThreshold(knnQuery.getIndexName());
// Refer this GitHub around more details <https://github.com/opensearch-project/k-NN/issues/1049> on the logic
if (filterIdsCount <= knnQuery.getK()) {
return true;
}
// See user has defined Exact Search filtered threshold. if yes, then use that setting.
if (isExactSearchThresholdSettingSet(filterThresholdValue)) {
return filterThresholdValue >= filterIdsCount;
}
// if no setting is set, then use the default max distance computation value to see if we can do exact search.
return KNNConstants.MAX_DISTANCE_COMPUTATIONS >= filterIdsCount * knnQuery.getQueryVector().length;
}
/**
* This function validates if {@link KNNSettings#ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD} is set or not. This
* is done by validating if the setting value is equal to the default value.
* @param filterThresholdValue value of the Index Setting: {@link KNNSettings#ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_SETTING}
* @return boolean true if the setting is set.
*/
private boolean isExactSearchThresholdSettingSet(int filterThresholdValue) {
return filterThresholdValue != KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE;
}
/**
* This condition mainly checks during filtered search we have more than K elements in filterIds but the ANN
* doesn't yeild K nearest neighbors.
* @param filterIdsCount count of filtered Doc ids
* @param annResultCount Count of Nearest Neighbours we got after doing filtered ANN Search.
* @return boolean - true if exactSearch needs to be done after ANNSearch.
*/
private boolean canDoExactSearchAfterANNSearch(final int filterIdsCount, final int annResultCount) {
return filterWeight != null && filterIdsCount >= knnQuery.getK() && knnQuery.getK() > annResultCount;
}