検索エンジン OSS 勉強会 第二回の発表資料です。発表者: gteu

ANN における Filtering とは

OpenSearch の ANN と Filtering

Faiss Efficient k-NN filtering のフロー

Untitled

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {

        final BitSet filterBitSet = getFilteredDocsBitSet(context);
        int cardinality = filterBitSet.cardinality();
        // We don't need to go to JNI layer if no documents are found which satisfy the filters
        // We should give this condition a deeper look that where it should be placed. For now I feel this is a good
        // place,
        if (filterWeight != null && cardinality == 0) {
            return KNNScorer.emptyScorer(this);
        }
        final Map<Integer, Float> docIdsToScoreMap = new HashMap<>();

        /*
         * The idea for this optimization is to get K results, we need to atleast look at K vectors in the HNSW graph
         * . Hence, if filtered results are less than K and filter query is present we should shift to exact search.
         * This improves the recall.
         */
        if (filterWeight != null && canDoExactSearch(cardinality)) {
            docIdsToScoreMap.putAll(doExactSearch(context, filterBitSet, cardinality));
        } else {
            Map<Integer, Float> annResults = doANNSearch(context, filterBitSet, cardinality);
            if (annResults == null) {
                return null;
            }
            if (canDoExactSearchAfterANNSearch(cardinality, annResults.size())) {
                log.debug(
                    "Doing ExactSearch after doing ANNSearch as the number of documents returned are less than "
                        + "K, even when we have more than K filtered Ids. K: {}, ANNResults: {}, filteredIdCount: {}",
                    knnQuery.getK(),
                    annResults.size(),
                    cardinality
                );
                annResults = doExactSearch(context, filterBitSet, cardinality);
            }
            docIdsToScoreMap.putAll(annResults);
        }
        if (docIdsToScoreMap.isEmpty()) {
            return KNNScorer.emptyScorer(this);
        }
        return convertSearchResponseToScorer(docIdsToScoreMap);
    }
    
    private boolean canDoExactSearch(final int filterIdsCount) {
        log.debug(
            "Info for doing exact search filterIdsLength : {}, Threshold value: {}",
            filterIdsCount,
            KNNSettings.getFilteredExactSearchThreshold(knnQuery.getIndexName())
        );
        if (knnQuery.getRadius() != null) {
            return false;
        }
        int filterThresholdValue = KNNSettings.getFilteredExactSearchThreshold(knnQuery.getIndexName());
        // Refer this GitHub around more details <https://github.com/opensearch-project/k-NN/issues/1049> on the logic
        if (filterIdsCount <= knnQuery.getK()) {
            return true;
        }
        // See user has defined Exact Search filtered threshold. if yes, then use that setting.
        if (isExactSearchThresholdSettingSet(filterThresholdValue)) {
            return filterThresholdValue >= filterIdsCount;
        }
        // if no setting is set, then use the default max distance computation value to see if we can do exact search.
        return KNNConstants.MAX_DISTANCE_COMPUTATIONS >= filterIdsCount * knnQuery.getQueryVector().length;
    }

    /**
     *  This function validates if {@link KNNSettings#ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD} is set or not. This
     *  is done by validating if the setting value is equal to the default value.
     * @param filterThresholdValue value of the Index Setting: {@link KNNSettings#ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_SETTING}
     * @return boolean true if the setting is set.
     */
    private boolean isExactSearchThresholdSettingSet(int filterThresholdValue) {
        return filterThresholdValue != KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE;
    }

    /**
     * This condition mainly checks during filtered search we have more than K elements in filterIds but the ANN
     * doesn't yeild K nearest neighbors.
     * @param filterIdsCount count of filtered Doc ids
     * @param annResultCount Count of Nearest Neighbours we got after doing filtered ANN Search.
     * @return boolean - true if exactSearch needs to be done after ANNSearch.
     */
    private boolean canDoExactSearchAfterANNSearch(final int filterIdsCount, final int annResultCount) {
        return filterWeight != null && filterIdsCount >= knnQuery.getK() && knnQuery.getK() > annResultCount;
    }

Untitled