* Allow unicode characters in URL query strings Fixes #8408 * Alternative approach to unicode support in urls Adds PoC/idea to approach this problem.
This commit is contained in:
parent
687a0cbcb0
commit
fdf819b83e
@ -99,7 +99,7 @@ class Formatter
|
|||||||
end
|
end
|
||||||
|
|
||||||
def encode_and_link_urls(html, accounts = nil, options = {})
|
def encode_and_link_urls(html, accounts = nil, options = {})
|
||||||
entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false)
|
entities = utf8_friendly_extractor(html, extract_url_without_protocol: false)
|
||||||
|
|
||||||
if accounts.is_a?(Hash)
|
if accounts.is_a?(Hash)
|
||||||
options = accounts
|
options = accounts
|
||||||
@ -199,6 +199,43 @@ class Formatter
|
|||||||
result.flatten.join
|
result.flatten.join
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Extracts entities from +text+ twice and merges the results:
#
# 1. from a copy of +text+ in which every character with a code point above
#    0xFF is replaced by its CGI-escaped form, so that such characters are
#    seen by the extractor as percent-escapes rather than raw characters;
# 2. from +text+ itself.
#
# Entities found in the escaped copy have their :indices translated back to
# positions in +text+, and their value (:url, :hashtag, :screen_name or
# :cashtag) re-read from +text+ so it contains the original characters.
#
# @param text [String] the text to scan
# @param options [Hash] passed through to Extractor.extract_entities_with_indices
# @return [Array<Hash>] whatever Extractor.remove_overlapping_entities returns
#   for the combined list
def utf8_friendly_extractor(text, options = {})
  # Maps an offset in the escaped string to the matching offset in +text+.
  # Only character boundaries of +text+ are present as keys.
  escaped_to_original = { 0 => 0 }
  escaped_offset = 0

  escaped = text.each_char.with_index(1).map do |char, next_original_offset|
    piece = char.ord > 0xFF ? CGI.escape(char) : char
    escaped_offset += piece.length
    escaped_to_original[escaped_offset] = next_original_offset
    piece
  end.join

  prefixed_keys = [:hashtag, :screen_name, :cashtag]

  # NOTE: for list mentions (@user/list-name) the list_slug could not be
  # obtained this way, so that case requires an additional check.
  special = Extractor.extract_entities_with_indices(escaped, options).map do |extract|
    # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
    key = (extract.keys & [:url, *prefixed_keys]).first

    first = escaped_to_original[extract[:indices].first]
    last = escaped_to_original[extract[:indices].last]

    # A boundary that falls inside a percent-escape has no counterpart in
    # +text+; drop the entity instead of failing on nil arithmetic. The scan
    # of the raw text below still gets a chance to report it.
    next if first.nil? || last.nil?

    # Skip the leading #, @ or $ when re-reading the value from +text+.
    value_start = prefixed_keys.include?(key) ? first + 1 : first

    extract.merge(indices: [first, last], key => text[value_start..last - 1])
  end.compact

  standard = Extractor.extract_entities_with_indices(text, options)

  Extractor.remove_overlapping_entities(special + standard)
end
|
||||||
|
|
||||||
def link_to_url(entity, options = {})
|
def link_to_url(entity, options = {})
|
||||||
url = Addressable::URI.parse(entity[:url])
|
url = Addressable::URI.parse(entity[:url])
|
||||||
html_attrs = { target: '_blank', rel: 'nofollow noopener' }
|
html_attrs = { target: '_blank', rel: 'nofollow noopener' }
|
||||||
|
@ -74,6 +74,7 @@ RSpec.describe Formatter do
|
|||||||
end
|
end
|
||||||
|
|
||||||
context 'given a URL with a query string' do
|
context 'given a URL with a query string' do
|
||||||
|
context 'with escaped unicode character' do
|
||||||
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
|
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
|
||||||
|
|
||||||
it 'matches the full URL' do
|
it 'matches the full URL' do
|
||||||
@ -81,6 +82,31 @@ RSpec.describe Formatter do
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context 'with unicode character' do
|
||||||
|
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' }
|
||||||
|
|
||||||
|
it 'matches the full URL' do
|
||||||
|
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&q=autolink"'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'with unicode character at the end' do
|
||||||
|
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' }
|
||||||
|
|
||||||
|
it 'matches the full URL' do
|
||||||
|
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
context 'with escaped and not escaped unicode characters' do
|
||||||
|
let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' }
|
||||||
|
|
||||||
|
it 'preserves escaped unicode characters' do
|
||||||
|
is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink"'
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
context 'given a URL with parentheses in it' do
|
context 'given a URL with parentheses in it' do
|
||||||
let(:text) { 'https://en.wikipedia.org/wiki/Diaspora_(software)' }
|
let(:text) { 'https://en.wikipedia.org/wiki/Diaspora_(software)' }
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user