From fdf819b83e820576164074b6726cb6ffdb4a47f6 Mon Sep 17 00:00:00 2001 From: Jakub Mendyk Date: Sat, 2 Feb 2019 19:01:18 +0100 Subject: [PATCH] Allow most kinds of characters in URL query (fixes #8408) (#8447) * Allow unicode characters in URL query strings Fixes #8408 * Alternative approach to unicode support in urls Adds PoC/idea to approch this problem. --- app/lib/formatter.rb | 39 +++++++++++++++++++++++++++++++++++++- spec/lib/formatter_spec.rb | 32 ++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb index 05fd9eeb1..2e3587169 100644 --- a/app/lib/formatter.rb +++ b/app/lib/formatter.rb @@ -99,7 +99,7 @@ class Formatter end def encode_and_link_urls(html, accounts = nil, options = {}) - entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false) + entities = utf8_friendly_extractor(html, extract_url_without_protocol: false) if accounts.is_a?(Hash) options = accounts @@ -199,6 +199,43 @@ class Formatter result.flatten.join end + def utf8_friendly_extractor(text, options = {}) + old_to_new_index = [0] + + escaped = text.chars.map do |c| + output = c.ord.to_s(16).length > 2 ? CGI.escape(c) : c + old_to_new_index << old_to_new_index.last + output.length + output + end.join + + # Note: I couldn't obtain list_slug with @user/list-name format + # for mention so this requires additional check + special = Extractor.extract_entities_with_indices(escaped, options).map do |extract| + # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present + key = (extract.keys & [:url, :hashtag, :screen_name, :cashtag]).first + + new_indices = [ + old_to_new_index.find_index(extract[:indices].first), + old_to_new_index.find_index(extract[:indices].last), + ] + + has_prefix_char = [:hashtag, :screen_name, :cashtag].include?(key) + value_indices = [ + new_indices.first + (has_prefix_char ? 1 : 0), # account for #, @ or $ + new_indices.last - 1, + ] + + next extract.merge( + :indices => new_indices, + key => text[value_indices.first..value_indices.last] + ) + end + + standard = Extractor.extract_entities_with_indices(text, options) + + Extractor.remove_overlapping_entities(special + standard) + end + def link_to_url(entity, options = {}) url = Addressable::URI.parse(entity[:url]) html_attrs = { target: '_blank', rel: 'nofollow noopener' } diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb index 0c1efe7c3..9872d3756 100644 --- a/spec/lib/formatter_spec.rb +++ b/spec/lib/formatter_spec.rb @@ -74,10 +74,36 @@ RSpec.describe Formatter do end context 'given a URL with a query string' do - let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' } + context 'with escaped unicode character' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' } - it 'matches the full URL' do - is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"' + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"' + end + end + + context 'with unicode character' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' } + + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&q=autolink"' + end + end + + context 'with unicode character at the end' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' } + + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"' + end + end + + context 'with escaped and not escaped unicode characters' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' } + + it 'preserves escaped unicode characters' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink"' + end end end