Advanced Search: Use reverse filename index
What does this MR do and why?
This feature is behind a feature flag called `elastic_file_name_reverse_optimization`.
Disclaimer: You should only use this feature flag with a newly created or reindexed index. The current plan is to reindex `gitlab-production` after this is merged in gitlab-com/gl-infra/production#6116 (closed).
This MR adds a new multi-field to our main index mappings in order to improve the efficiency of the Advanced Search `extension` filter. Currently it uses a wildcard search on `path` (for example, `*.rb`), which is really expensive. With the new multi-field, we'll have `file_name` indexed in reverse, and we'll be able to use this index to improve query performance.
Click to see old query
{
"query": {
"bool": {
"must": {
"simple_query_string": {
"_name": "blob:match:search_terms",
"fields": [
"blob.content",
"blob.file_name",
"blob.path"
],
"query": "*",
"default_operator": "and"
}
},
"must_not": [],
"should": [],
"filter": [
{
"has_parent": {
"_name": "blob:authorized:project",
"parent_type": "project",
"query": {
"bool": {
"should": [
{
"bool": {
"filter": [
{
"term": {
"visibility_level": {
"_name": "blob:authorized:project:any",
"value": 0
}
}
},
{
"terms": {
"_name": "blob:authorized:project:repository:enabled_or_private",
"repository_access_level": [
20,
10
]
}
}
]
}
},
{
"bool": {
"_name": "blob:authorized:project:visibility:10:repository:access_level",
"filter": [
{
"term": {
"visibility_level": {
"_name": "blob:authorized:project:visibility:10",
"value": 10
}
}
},
{
"terms": {
"_name": "blob:authorized:project:visibility:10:repository:access_level:enabled_or_private",
"repository_access_level": [
20,
10
]
}
}
]
}
},
{
"bool": {
"_name": "blob:authorized:project:visibility:20:repository:access_level",
"filter": [
{
"term": {
"visibility_level": {
"_name": "blob:authorized:project:visibility:20",
"value": 20
}
}
},
{
"terms": {
"_name": "blob:authorized:project:visibility:20:repository:access_level:enabled_or_private",
"repository_access_level": [
20,
10
]
}
}
]
}
}
]
}
}
}
},
{
"term": {
"type": {
"_name": "doc:is_a:blob",
"value": "blob"
}
}
},
{
"wildcard": {
"blob.path": "*.js"
}
}
]
}
},
"size": 20,
"from": 0,
"sort": [
"_score"
],
"highlight": {
"pre_tags": [
"gitlabelasticsearch→"
],
"post_tags": [
"←gitlabelasticsearch"
],
"number_of_fragments": 0,
"fields": {
"blob.content": {},
"blob.file_name": {}
}
}
}
Click to see new query
{
"query": {
"bool": {
"must": {
"simple_query_string": {
"_name": "blob:match:search_terms",
"fields": [
"blob.content",
"blob.file_name",
"blob.path"
],
"query": "*",
"default_operator": "and"
}
},
"must_not": [],
"should": [],
"filter": [
{
"has_parent": {
"_name": "blob:authorized:project",
"parent_type": "project",
"query": {
"bool": {
"should": [
{
"bool": {
"filter": [
{
"term": {
"visibility_level": {
"_name": "blob:authorized:project:any",
"value": 0
}
}
},
{
"terms": {
"_name": "blob:authorized:project:repository:enabled_or_private",
"repository_access_level": [
20,
10
]
}
}
]
}
},
{
"bool": {
"_name": "blob:authorized:project:visibility:10:repository:access_level",
"filter": [
{
"term": {
"visibility_level": {
"_name": "blob:authorized:project:visibility:10",
"value": 10
}
}
},
{
"terms": {
"_name": "blob:authorized:project:visibility:10:repository:access_level:enabled_or_private",
"repository_access_level": [
20,
10
]
}
}
]
}
},
{
"bool": {
"_name": "blob:authorized:project:visibility:20:repository:access_level",
"filter": [
{
"term": {
"visibility_level": {
"_name": "blob:authorized:project:visibility:20",
"value": 20
}
}
},
{
"terms": {
"_name": "blob:authorized:project:visibility:20:repository:access_level:enabled_or_private",
"repository_access_level": [
20,
10
]
}
}
]
}
}
]
}
}
}
},
{
"term": {
"type": {
"_name": "doc:is_a:blob",
"value": "blob"
}
}
},
{
"prefix": {
"blob.file_name.reverse": "sj."
}
}
]
}
},
"size": 20,
"from": 0,
"sort": [
"_score"
],
"highlight": {
"pre_tags": [
"gitlabelasticsearch→"
],
"post_tags": [
"←gitlabelasticsearch"
],
"number_of_fragments": 0,
"fields": {
"blob.content": {},
"blob.file_name": {}
}
}
}
Click to see query diff
diff --git a/query.json b/query.json
index 1e31b3e..e8ef073 100644
--- a/query.json
+++ b/query.json
@@ -108,8 +108,8 @@
}
},
{
- "wildcard": {
- "blob.path": "*.js"
+ "prefix": {
+ "blob.file_name.reverse": "sj."
}
}
]
Useful links
- https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-reverse-tokenfilter.html
- https://www.elastic.co/guide/en/elasticsearch/reference/7.16/multi-fields.html
Screenshots or screen recordings
These are strongly recommended to assist reviewers and reduce the time to merge your change.
How to set up and validate locally
- Checkout the branch of this MR
- Enable Advanced Search if it's not configured
- If your index was created before that, you'll need to reindex everything using Elasticsearch zero-downtime reindexing to apply the new mappings. Please ensure this step is finished before proceeding to the next one. You can run this command to force faster reindexing:
while true; ElasticClusterReindexingCronWorker.new.perform; sleep 1; end
- Search for `extension:js` using the Code tab (http://localhost:3000/search?repository_ref=&scope=blobs&search=extension%3Ajs&snippets=)
- Ensure that it returns results with `*.js` files
- Enable the feature flag: `Feature.enable(:elastic_file_name_reverse_optimization)`
- Perform the search again and ensure that the results are the same
MR acceptance checklist
This checklist encourages us to confirm any changes have been analyzed to reduce risks in quality, performance, reliability, security, and maintainability.
- I have evaluated the MR acceptance checklist for this MR.
Related to #349099 (closed)