# Elasticsearch grouping solution
#
# Elasticsearch does not currently provide a GROUP BY equivalent, so this is
# an attempt to do the grouping manually.
# In this example we have articles written by some authors, and we want the
# most relevant docs, but no more than one per author.
#
# Assumptions:
#
# 1) We are looking for relevant content.
# 2) We assume that the first 300 docs are the relevant ones, so only this
#    selection is considered, regardless of how many of them come from the
#    same few authors.
# 3) Real pagination was not needed; a "show more" button updated through
#    ajax was enough.

# Fixture setup: recreate the 'articles' index, add six docs spread over three
# authors, then refresh the index so the docs are immediately searchable.
`curl -X DELETE "http://localhost:9200/articles"
curl -X PUT "http://localhost:9200/articles" -d '{ "settings": { "index": { "number_of_shards": 1, "number_of_replicas": 0 } } }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 111, "author_id": "user_1", "title": "One bad doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 222, "author_id": "user_2", "title": "Two bad doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 333, "author_id": "user_3", "title": "Three good doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 444, "author_id": "user_1", "title": "Four good doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 555, "author_id": "user_2", "title": "Five good doc", "findable": true }'
curl -X POST "http://localhost:9200/articles/article" -d '{ "id": 666, "author_id": "user_1", "title": "Six good doc", "findable": true }'
curl -XPOST 'http://localhost:9200/articles/_refresh'`

#
# Raw test of our query
#
# curl -X POST "http://localhost:9200/articles/_search?pretty=true" -d '{
#   "query": {
#     "bool": {
#       "must":   [{ "query_string": { "query": "doc", "default_operator": "AND" } }],
#       "should": [{ "query_string": { "query": "user_2", "default_operator": "AND", "boost": 2000 } }]
#     }
#   },
#   "filter": { "and": [{ "term": { "findable": "true" } }] },
#   "facets": {
#     "tags": { "terms": {"field": "owner", "size": 10} }
#   }
# }'

# Offset of the first grouped result to show and the page size.
# (In a real app params_start_from would come from the request.)
params_start_from = 0
per_page = 3

# Every doc must match "doc"; docs that also match "user_2" get a large boost.
my_query = {
  bool: {
    must:   [{ query_string: { query: "doc", default_operator: "AND" } }],
    should: [{ query_string: { query: "user_2", default_operator: "AND", boost: 2000 } }]
  }
}

# Filters shared by both queries.
my_and_filters = [
  { term: { findable: "true" } }
]

# FIRST QUERY - find the ids of all relevant docs.
# Only 'id' and 'author_id' are fetched, capped at the first 300 hits.
all_res = Tire.search 'articles',
  query:  my_query,
  filter: { :and => my_and_filters },
  fields: ['id', 'author_id'],
  size:   300

# Keep only the first (highest-ranked) hit for each author.
docs = all_res.results.to_a.uniq { |el| el['author_id'] }

@total_results_non_unique = all_res.results.total # <-- instance variable: hits before grouping
@total_results            = docs.size             # <-- instance variable: hits after grouping

# PAGINATION
# start_from should always be < Settings.research.max_results.
# A negative offset is clamped to 0 so it cannot index from the end of the array.
start_from = [params_start_from.to_i, 0].max
# Array#[start, length] returns nil when start is past the end; fall back to [].
page_docs = docs[start_from, per_page] || []
doc_ids   = page_docs.map { |doc| doc['id'] }

# SECOND QUERY - fetch the selected page of docs by id.
# Build a page-specific filter list instead of mutating the shared one.
# (The original appended to an undefined `and_filters`, raising NameError.)
page_filters = my_and_filters + [{ ids: { values: doc_ids } }]

# TODO: move :highlight to Part 1 and query only by :id
res = Tire.search 'articles',
  query:     my_query,
  filter:    { :and => page_filters },
  highlight: { fields: ['title'] },
  size:      per_page