Add more accurate hashtag search (#11579)
* Add more accurate hashtag search. Using Elasticsearch to index hashtags with edge n-grams and score them by usage within the last 7 days since last activity. Only hashtags that have been reviewed and are listable can appear in searches, unless they match the query exactly. * Fix search analyzer dropping non-ASCII characters.
This commit is contained in:
		
							
								
								
									
										37
									
								
								app/chewy/tags_index.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								app/chewy/tags_index.rb
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,37 @@
 | 
			
		||||
# frozen_string_literal: true
 | 
			
		||||
 | 
			
		||||
# Elasticsearch (Chewy) index of hashtags.
#
# A tag's name is indexed two ways: as one normalized token (`name`) and as
# edge n-grams of that token (`name.edge_ngram`), so both whole-name and
# prefix lookups can be served from the same document.
class TagsIndex < Chewy::Index
  settings(
    index: {
      refresh_interval: '15m',
    },
    analysis: {
      analyzer: {
        # The keyword tokenizer keeps the whole name as a single token; the
        # filters then lowercase it, fold it to ASCII and normalize CJK width.
        content: {
          tokenizer: 'keyword',
          filter: %w(lowercase asciifolding cjk_width),
        },

        # Same normalization, applied to the prefixes produced by the custom
        # edge_ngram tokenizer declared below.
        edge_ngram: {
          tokenizer: 'edge_ngram',
          filter: %w(lowercase asciifolding cjk_width),
        },
      },

      tokenizer: {
        # Emits prefixes between 2 and 15 characters long.
        edge_ngram: {
          type: 'edge_ngram',
          min_gram: 2,
          max_gram: 15,
        },
      },
    }
  )

  # Only listable tags are imported; a tag that has been destroyed or is no
  # longer listable is deleted from the index instead of being updated.
  define_type ::Tag.listable, delete_if: ->(record) { record.destroyed? || !record.listable? } do
    root date_detection: false do
      field :name, type: 'text', analyzer: 'content' do
        # Indexed as n-grams, but searched with the whole-token analyzer so
        # the search term itself is not split into n-grams.
        field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
      end

      field :reviewed, type: 'boolean', value: ->(record) { record.reviewed? }

      # Total of the :accounts counts across every entry of the tag's history.
      field :usage, type: 'long', value: ->(record) { record.history.sum { |entry| entry[:accounts].to_i } }

      # Falls back to the creation time when the tag has no recorded status time.
      field :last_status_at, type: 'date', value: ->(record) { record.last_status_at || record.created_at }
    end
  end
end
 | 
			
		||||
@ -13,6 +13,8 @@
 | 
			
		||||
#  listable            :boolean
 | 
			
		||||
#  reviewed_at         :datetime
 | 
			
		||||
#  requested_review_at :datetime
 | 
			
		||||
#  last_status_at      :datetime
 | 
			
		||||
#  last_trend_at       :datetime
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
class Tag < ApplicationRecord
 | 
			
		||||
@ -33,7 +35,8 @@ class Tag < ApplicationRecord
 | 
			
		||||
  scope :unreviewed, -> { where(reviewed_at: nil) }
 | 
			
		||||
  scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) }
 | 
			
		||||
  scope :usable, -> { where(usable: [true, nil]) }
 | 
			
		||||
  scope :discoverable, -> { where(listable: [true, nil]).joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
 | 
			
		||||
  scope :listable, -> { where(listable: [true, nil]) }
 | 
			
		||||
  scope :discoverable, -> { listable.joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
 | 
			
		||||
  scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) }
 | 
			
		||||
 | 
			
		||||
  delegate :accounts_count,
 | 
			
		||||
@ -44,6 +47,8 @@ class Tag < ApplicationRecord
 | 
			
		||||
 | 
			
		||||
  after_save :save_account_tag_stat
 | 
			
		||||
 | 
			
		||||
  update_index('tags#tag', :self) if Chewy.enabled?
 | 
			
		||||
 | 
			
		||||
  def account_tag_stat
 | 
			
		||||
    super || build_account_tag_stat
 | 
			
		||||
  end
 | 
			
		||||
@ -121,9 +126,10 @@ class Tag < ApplicationRecord
 | 
			
		||||
      normalized_term = normalize(term.strip).mb_chars.downcase.to_s
 | 
			
		||||
      pattern         = sanitize_sql_like(normalized_term) + '%'
 | 
			
		||||
 | 
			
		||||
      Tag.where(arel_table[:name].lower.matches(pattern))
 | 
			
		||||
         .where(arel_table[:score].gt(0).or(arel_table[:name].lower.eq(normalized_term)))
 | 
			
		||||
         .order(Arel.sql('length(name) ASC, score DESC, name ASC'))
 | 
			
		||||
      Tag.listable
 | 
			
		||||
         .where(arel_table[:name].lower.matches(pattern))
 | 
			
		||||
         .where(arel_table[:name].lower.eq(normalized_term).or(arel_table[:reviewed_at].not_eq(nil)))
 | 
			
		||||
         .order(Arel.sql('length(name) ASC, name ASC'))
 | 
			
		||||
         .limit(limit)
 | 
			
		||||
         .offset(offset)
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
@ -17,6 +17,9 @@ class TrendingTags
 | 
			
		||||
      increment_historical_use!(tag.id, at_time)
 | 
			
		||||
      increment_unique_use!(tag.id, account.id, at_time)
 | 
			
		||||
      increment_vote!(tag, at_time)
 | 
			
		||||
 | 
			
		||||
      tag.update(last_status_at: Time.now.utc) if tag.last_status_at.nil? || tag.last_status_at < 12.hours.ago
 | 
			
		||||
      tag.update(last_trend_at: Time.now.utc)  if trending?(tag) && (tag.last_trend_at.nil? || tag.last_trend_at < 12.hours.ago)
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    def get(limit, filtered: true)
 | 
			
		||||
 | 
			
		||||
@ -109,7 +109,7 @@ class AccountSearchService < BaseService
 | 
			
		||||
      field_value_factor: {
 | 
			
		||||
        field: 'followers_count',
 | 
			
		||||
        modifier: 'log2p',
 | 
			
		||||
        missing: 1,
 | 
			
		||||
        missing: 0,
 | 
			
		||||
      },
 | 
			
		||||
    }
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
@ -57,10 +57,10 @@ class SearchService < BaseService
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  def perform_hashtags_search!
 | 
			
		||||
    Tag.search_for(
 | 
			
		||||
      @query.gsub(/\A#/, ''),
 | 
			
		||||
      @limit,
 | 
			
		||||
      @offset
 | 
			
		||||
    TagSearchService.new.call(
 | 
			
		||||
      @query,
 | 
			
		||||
      limit: @limit,
 | 
			
		||||
      offset: @offset
 | 
			
		||||
    )
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										82
									
								
								app/services/tag_search_service.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										82
									
								
								app/services/tag_search_service.rb
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,82 @@
 | 
			
		||||
# frozen_string_literal: true
 | 
			
		||||
 | 
			
		||||
# Searches hashtags by name. Uses the Elasticsearch-backed TagsIndex when
# Chewy is enabled and falls back to Tag.search_for otherwise.
class TagSearchService < BaseService
  # @param query [String] search term; surrounding whitespace and one leading
  #   "#" are removed before searching
  # @param options [Hash] accepts :limit and :offset; both are coerced with
  #   #to_i, so an absent value becomes 0
  # @return the matching tags: the compacted objects loaded from the index, or
  #   whatever Tag.search_for returns on the database path
  def call(query, options = {})
    @query  = query.strip.sub(/\A#/, '')
    @offset = options[:offset].to_i
    @limit  = options[:limit].to_i

    if Chewy.enabled?
      from_elasticsearch
    else
      from_database
    end
  end

  private

  # Runs the scored query, restricted by the visibility filter, and loads the
  # corresponding records. `compact` drops entries for which no object could
  # be loaded.
  def from_elasticsearch
    TagsIndex.query(scored_query)
             .filter(visibility_filter)
             .limit(@limit)
             .offset(@offset)
             .objects
             .compact
  end

  # Matches the term against both the whole name and its edge n-grams, then
  # multiplies the text relevance by the scoring functions below.
  def scored_query
    {
      function_score: {
        query: {
          multi_match: {
            query: @query,
            fields: %w(name.edge_ngram name),
            type: 'most_fields',
            operator: 'and',
          },
        },

        functions: [
          # More heavily used tags rank higher; the log2p modifier dampens
          # the effect of very large usage counts.
          {
            field_value_factor: {
              field: 'usage',
              modifier: 'log2p',
              missing: 0,
            },
          },

          # Recency decay on last activity: no penalty within the 14 day
          # offset, then the factor falls to 0.5 a further 7 days out.
          {
            gauss: {
              last_status_at: {
                scale: '7d',
                offset: '14d',
                decay: 0.5,
              },
            },
          },
        ],

        boost_mode: 'multiply',
      },
    }
  end

  # Keeps a tag when it has been reviewed OR its name is exactly the term.
  #
  # The exact-name clause must be a `match` query rather than `term`: `name`
  # is a text field indexed through the `content` analyzer (lowercase,
  # asciifolding, cjk_width), and `term` does not analyze its input. A term
  # such as "Ruby" would therefore never equal the stored token "ruby".
  # Because that analyzer uses the keyword tokenizer, `match` still compares
  # the whole name, only after the same normalization.
  def visibility_filter
    {
      bool: {
        should: [
          {
            term: {
              reviewed: {
                value: true,
              },
            },
          },

          {
            match: {
              name: {
                query: @query,
              },
            },
          },
        ],
      },
    }
  end

  # Database fallback used when Chewy is disabled.
  def from_database
    Tag.search_for(@query, @limit, @offset)
  end
end
 | 
			
		||||
@ -142,7 +142,7 @@ en:
 | 
			
		||||
        report: Send e-mail when a new report is submitted
 | 
			
		||||
        trending_tag: Send e-mail when an unreviewed hashtag is trending
 | 
			
		||||
      tag:
 | 
			
		||||
        listable: Allow this hashtag to appear on the profile directory
 | 
			
		||||
        listable: Allow this hashtag to appear in searches and on the profile directory
 | 
			
		||||
        trendable: Allow this hashtag to appear under trends
 | 
			
		||||
        usable: Allow toots to use this hashtag
 | 
			
		||||
    'no': 'No'
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										6
									
								
								db/migrate/20190815225426_add_last_status_at_to_tags.rb
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								db/migrate/20190815225426_add_last_status_at_to_tags.rb
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,6 @@
 | 
			
		||||
# Adds the `last_status_at` and `last_trend_at` datetime columns to `tags`.
class AddLastStatusAtToTags < ActiveRecord::Migration[5.2]
  def change
    %i(last_status_at last_trend_at).each do |column_name|
      add_column :tags, column_name, :datetime
    end
  end
end
 | 
			
		||||
@ -10,7 +10,7 @@
 | 
			
		||||
#
 | 
			
		||||
# It's strongly recommended that you check this file into your version control system.
 | 
			
		||||
 | 
			
		||||
ActiveRecord::Schema.define(version: 2019_08_07_135426) do
 | 
			
		||||
ActiveRecord::Schema.define(version: 2019_08_15_225426) do
 | 
			
		||||
 | 
			
		||||
  # These are extensions that must be enabled in order to support this database
 | 
			
		||||
  enable_extension "plpgsql"
 | 
			
		||||
@ -667,6 +667,8 @@ ActiveRecord::Schema.define(version: 2019_08_07_135426) do
 | 
			
		||||
    t.boolean "listable"
 | 
			
		||||
    t.datetime "reviewed_at"
 | 
			
		||||
    t.datetime "requested_review_at"
 | 
			
		||||
    t.datetime "last_status_at"
 | 
			
		||||
    t.datetime "last_trend_at"
 | 
			
		||||
    t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -136,8 +136,8 @@ RSpec.describe Tag, type: :model do
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    it 'finds the exact matching tag as the first item' do
 | 
			
		||||
      similar_tag = Fabricate(:tag, name: "matchlater", score: 1)
 | 
			
		||||
      tag = Fabricate(:tag, name: "match", score: 1)
 | 
			
		||||
      similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc)
 | 
			
		||||
      tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc)
 | 
			
		||||
 | 
			
		||||
      results = Tag.search_for("match")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
		Reference in New Issue
	
	Block a user