Use charlock_holmes instead of nkf in FetchLinkCardService (#4080)

* Specs for language detection

* Use CharlockHolmes instead of NKF

* Correct mistakes

* Correct style

* Set hint_enc instead of falling back and strip_tags

* Improve specs

* Add dependencies
This commit is contained in:
nullkal
2017-07-09 05:44:31 +09:00
committed by Eugen Rochko
parent 794781d121
commit 007ab330e6
11 changed files with 78 additions and 4 deletions

View File

@ -32,6 +32,7 @@ addons:
- g++-6
- libprotobuf-dev
- protobuf-compiler
- libicu-dev
rvm:
- 2.3.4

View File

@ -3,3 +3,4 @@ libprotobuf-dev
ffmpeg
libxdamage1
libxfixes3
libicu-dev

View File

@ -25,6 +25,7 @@ RUN echo "@edge https://nl.alpinelinux.org/alpine/edge/main" >> /etc/apk/reposit
ffmpeg \
file \
git \
icu-dev \
imagemagick@edge \
libpq \
libxml2 \

View File

@ -22,6 +22,7 @@ gem 'active_model_serializers', '~> 0.10'
gem 'addressable', '~> 2.5'
gem 'bootsnap'
gem 'browser'
gem 'charlock_holmes', '~> 0.7.3'
gem 'cld3', '~> 3.1'
gem 'devise', '~> 4.2'
gem 'devise-two-factor', '~> 3.0'

View File

@ -106,6 +106,7 @@ GEM
rack (>= 1.0.0)
rack-test (>= 0.5.4)
xpath (~> 2.0)
charlock_holmes (0.7.3)
case_transform (0.2)
activesupport
chunky_png (1.3.8)
@ -501,6 +502,7 @@ DEPENDENCIES
capistrano-rbenv (~> 2.1)
capistrano-yarn (~> 2.0)
capybara (~> 2.14)
charlock_holmes (~> 0.7.3)
cld3 (~> 3.1)
climate_control (~> 0.2)
devise (~> 4.2)

1
Vagrantfile vendored
View File

@ -37,6 +37,7 @@ sudo apt-get install \
yarn \
libprotobuf-dev \
libreadline-dev \
libicu-dev \
-y
# Install rvm

View File

@ -1,5 +1,4 @@
# frozen_string_literal: true
require 'nkf'
class FetchLinkCardService < BaseService
include HttpHelper
@ -86,7 +85,12 @@ class FetchLinkCardService < BaseService
return if response.code != 200 || response.mime_type != 'text/html'
html = response.to_s
page = Nokogiri::HTML(html, nil, NKF.guess(html).to_s)
detector = CharlockHolmes::EncodingDetector.new
detector.strip_tags = true
guess = detector.detect(html, response.charset)
page = Nokogiri::HTML(html, nil, guess&.fetch(:encoding))
card.type = :link
card.title = meta_property(page, 'og:title') || page.at_xpath('//title')&.content

20
spec/fixtures/requests/koi8-r.txt vendored Normal file
View File

@ -0,0 +1,20 @@
HTTP/1.1 200 OK
Server: nginx/1.11.10
Date: Tue, 04 Jul 2017 16:43:39 GMT
Content-Type: text/html
Content-Length: 273
Connection: keep-alive
Last-Modified: Tue, 04 Jul 2017 16:41:34 GMT
Accept-Ranges: bytes
<HTML>
<HEAD>
<META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
<META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=koi8-r">
<TITLE>Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.</TITLE>
</HEAD>
<BODY>
<P><CENTER><B><FONT SIZE="+2">Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.</FONT></B><BR>
<HR><BR>
</BODY>
</HTML>

View File

@ -11,10 +11,10 @@ Accept-Ranges: bytes
<HEAD>
<META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
<META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=x-sjis">
<TITLE>JSISのページ</TITLE>
<TITLE>SJISのページ</TITLE>
</HEAD>
<BODY>
<P><CENTER><B><FONT SIZE="+2">SJISのページ</FONT></B><BR>
<P><CENTER><B><FONT SIZE="+2"><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>N<EFBFBD>܂<EFBFBD><EFBFBD>Ă<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>L<EFBFBD>O<EFBFBD>l<EFBFBD><EFBFBD><EFBFBD>Ă<EFBFBD><EFBFBD>̂̎<EFBFBD><EFBFBD>ł<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ł<EFBFBD><EFBFBD>B<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ԂɈӖ<EFBFBD><EFBFBD>҂͐<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǂ<EFBFBD><EFBFBD>Ȕ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>܂<EFBFBD><EFBFBD><EFBFBD><EFBFBD>܂ł<EFBFBD><EFBFBD>\<5C><><EFBFBD><EFBFBD><E382B0><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɂ͎Q<CD8E>l<EFBFBD>A<EFBFBD><EFBFBD><E982BD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>A<EFBFBD><41><EFBFBD><EFBFBD><EFBFBD>ɂ<EFBFBD><C982><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>܂<EFBFBD><DC82>Ȃ<EFBFBD><C882>B<EFBFBD><42><EFBFBD><EFBFBD><EFBFBD><EFBFBD><E782A2><EFBFBD>Ȃ<EFBFBD><C882>̂͂ǂ<CD82><C782><EFBFBD><EFBFBD><EFBFBD><E38C8E><EFBFBD>ł<EFBFBD><C582><EFBFBD><E982BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>B<EFBFBD><42><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ĉ<EFBFBD><C489>c<EFBFBD><63><EFBFBD><EFBFBD><EFBFBD>ɔ<EFBFBD><C994>R<EFBFBD>K<EFBFBD><4B><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɉ]<5D><><EFBFBD>ł<EFBFBD><C582><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͂<EFBFBD><CD82><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͂<EFBFBD><CD82>Ȃ<EFBFBD><C882><EFBFBD><EFBFBD>w<EFBFBD>}<7D><><EFBFBD>Ƃ<EFBFBD><C682><EFBFBD><EFBFBD><EFBFBD><EFBFBD>o<EFBFBD><6F><EFBFBD><EFBFBD><EFBFBD>Ȃ<EFBFBD><C882><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ȃ<EFBFBD><C882><EFBFBD><EFBFBD>āA<C481><41><EFBFBD>͎̐̂<CC82><CD8E><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͉A<CD89><41><EFBFBD>{<7B><EFBFBD><E782A9><EFBFBD>A<EFBFBD>v<EFBFBD><76><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̂<EFBFBD><CC82>̂<EFBFBD><CC82><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̂<EFBFBD><CC82>‚<EFBFBD><C282><EFBFBD><EFBFBD><EFB
FBD><EFBFBD><EFBFBD><EFBFBD>ɂ<EFBFBD><C982><EFBFBD><EFBFBD>]<5D>ƌ<EFBFBD><C68C><EFBFBD><EFBFBD>΂<EFBFBD><CE82><EFBFBD>man<61>ɂ<EFBFBD><C982><EFBFBD><EFBFBD><EFBFBD><EFBFBD>֎Q<D68E><51><EFBFBD><EFBFBD>ɓ<EFBFBD><C993><EFBFBD><EFBFBD>ɂ<EFBFBD><C982><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>łȂ<C582><C882>̂ŁA<C581><41><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>\<5C><><EFBFBD>ɕς<C995><CF82><EFBFBD><EFBFBD>Ă<EFBFBD><C482><EFBFBD><EFBFBD>ł<EFBFBD><C582><EFBFBD><EFBFBD>ōl<C58D><6C><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>B<EFBFBD><42><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>΂<EFBFBD><CE82><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƃǂ܂<C782><DC82><EFBFBD><EFBFBD>̂<EFBFBD><CC82><EFBFBD><EFBFBD>ۂނ<DB82><DE82>݂Ƃ<DD82><C682><EFBFBD><EFBFBD>ł<EFBFBD><C582>āA<C481><41><EFBFBD>̎<EFBFBD><CC8E><EFBFBD><EFBFBD>ł͐\<5C><><EFBFBD><EFBFBD><EFBFBD>ĂƂ<C482><C682>Đ<EFBFBD><C490>Ԃɕ<D482><C995>ׂ̂ɍs<C98D><73><EFBFBD>Ȃ<EFBFBD><C882><EFBFBD><EFBFBD>ȁB</FONT></B><BR>
<HR><BR>
</BODY>
</HTML>

View File

@ -0,0 +1,20 @@
HTTP/1.1 200 OK
Server: nginx/1.11.10
Date: Tue, 04 Jul 2017 16:43:39 GMT
Content-Type: text/html; charset=utf-8
Content-Length: 273
Connection: keep-alive
Last-Modified: Tue, 04 Jul 2017 16:41:34 GMT
Accept-Ranges: bytes
<HTML>
<HEAD>
<META NAME="GENERATOR" CONTENT="Adobe PageMill 3.0J Mac">
<META HTTP-EQUIV="Content-Type" CONTENT="text/html;CHARSET=x-sjis">
<TITLE>SJISのページ</TITLE>
</HEAD>
<BODY>
<P><CENTER><B><FONT SIZE="+2"><3E><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>N<EFBFBD>܂<EFBFBD><DC82>Ă<EFBFBD><C482><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>L<EFBFBD>O<EFBFBD>l<EFBFBD><6C><EFBFBD>Ă<EFBFBD><C482>̂̎<CC82><CC8E>ł<EFBFBD><C582><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ł<EFBFBD><C582>B<EFBFBD><42><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ԂɈӖ<C988><D396>҂͐<D282><CD90><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǂ<EFBFBD><C782>Ȕ<EFBFBD><C894><EFBFBD><EFBFBD>܂<EFBFBD><DC82><EFBFBD><EFBFBD>܂ł<DC82><C582>\<5C><><EFBFBD><EFBFBD><E382B0><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɂ͎Q<CD8E>l<EFBFBD>A<EFBFBD><EFBFBD><E982BD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>A<EFBFBD><41><EFBFBD><EFBFBD><EFBFBD>ɂ<EFBFBD><C982><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>܂<EFBFBD><DC82>Ȃ<EFBFBD><C882>B<EFBFBD><42><EFBFBD><EFBFBD><EFBFBD><EFBFBD><E782A2><EFBFBD>Ȃ<EFBFBD><C882>̂͂ǂ<CD82><C782><EFBFBD><EFBFBD><EFBFBD><E38C8E><EFBFBD>ł<EFBFBD><C582><EFBFBD><E982BE><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>B<EFBFBD><42><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ĉ<EFBFBD><C489>c<EFBFBD><63><EFBFBD><EFBFBD><EFBFBD>ɔ<EFBFBD><C994>R<EFBFBD>K<EFBFBD><4B><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɉ]<5D><><EFBFBD>ł<EFBFBD><C582><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͂<EFBFBD><CD82><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͂<EFBFBD><CD82>Ȃ<EFBFBD><C882><EFBFBD><EFBFBD>w<EFBFBD>}<7D><><EFBFBD>Ƃ<EFBFBD><C682><EFBFBD><EFBFBD><EFBFBD><EFBFBD>o<EFBFBD><6F><EFBFBD><EFBFBD><EFBFBD>Ȃ<EFBFBD><C882><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ȃ<EFBFBD><C882><EFBFBD><EFBFBD>āA<C481><41><EFBFBD>͎̐̂<CC82><CD8E><EFBFBD><EFBFBD><EFBFBD><EFBFBD>͉A<CD89><41><EFBFBD>{<7B><EFBFBD><E782A9><EFBFBD>A<EFBFBD>v<EFBFBD><76><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̂<EFBFBD><CC82>̂<EFBFBD><CC82><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̂<EFBFBD><CC82>‚<EFBFBD><C282><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɂ<EFBFBD><C982><EFBFBD
><EFBFBD>]<5D>ƌ<EFBFBD><C68C><EFBFBD><EFBFBD>΂<EFBFBD><CE82><EFBFBD>man<61>ɂ<EFBFBD><C982><EFBFBD><EFBFBD><EFBFBD><EFBFBD>֎Q<D68E><51><EFBFBD><EFBFBD>ɓ<EFBFBD><C993><EFBFBD><EFBFBD>ɂ<EFBFBD><C982><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>łȂ<C582><C882>̂ŁA<C581><41><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>\<5C><><EFBFBD>ɕς<C995><CF82><EFBFBD><EFBFBD>Ă<EFBFBD><C482><EFBFBD><EFBFBD>ł<EFBFBD><C582><EFBFBD><EFBFBD>ōl<C58D><6C><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>B<EFBFBD><42><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>΂<EFBFBD><CE82><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ƃǂ܂<C782><DC82><EFBFBD><EFBFBD>̂<EFBFBD><CC82><EFBFBD><EFBFBD>ۂނ<DB82><DE82>݂Ƃ<DD82><C682><EFBFBD><EFBFBD>ł<EFBFBD><C582>āA<C481><41><EFBFBD>̎<EFBFBD><CC8E><EFBFBD><EFBFBD>ł͐\<5C><><EFBFBD><EFBFBD><EFBFBD>ĂƂ<C482><C682>Đ<EFBFBD><C490>Ԃɕ<D482><C995>ׂ̂ɍs<C98D><73><EFBFBD>Ȃ<EFBFBD><C882><EFBFBD><EFBFBD>ȁB</FONT></B><BR>
<HR><BR>
</BODY>
</HTML>

View File

@ -8,6 +8,10 @@ RSpec.describe FetchLinkCardService do
stub_request(:get, 'http://example.xn--fiqs8s/').to_return(request_fixture('idn.txt'))
stub_request(:head, 'http://example.com/sjis').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
stub_request(:get, 'http://example.com/sjis').to_return(request_fixture('sjis.txt'))
stub_request(:head, 'http://example.com/sjis_with_wrong_charset').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404)
subject.call(status)
@ -27,6 +31,25 @@ RSpec.describe FetchLinkCardService do
it 'works with SJIS' do
expect(a_request(:get, 'http://example.com/sjis')).to have_been_made.at_least_once
expect(status.preview_card.title).to eq("SJISのページ")
end
end
# A page whose stubbed response is loaded from the
# sjis_with_wrong_charset.txt fixture must still yield the correctly
# decoded title on the preview card.
context do
  let(:status) do
    Fabricate(:status, text: 'Check out http://example.com/sjis_with_wrong_charset')
  end

  it 'works with SJIS even with wrong charset header' do
    page_fetch = a_request(:get, 'http://example.com/sjis_with_wrong_charset')
    expect(page_fetch).to have_been_made.at_least_once

    card_title = status.preview_card.title
    expect(card_title).to eq('SJISのページ')
  end
end
# A page whose stubbed response is loaded from the koi8-r.txt fixture
# must yield the expected Cyrillic title on the preview card.
context do
  let(:status) do
    Fabricate(:status, text: 'Check out http://example.com/koi8-r')
  end

  it 'works with koi8-r' do
    page_fetch = a_request(:get, 'http://example.com/koi8-r')
    expect(page_fetch).to have_been_made.at_least_once

    card_title = status.preview_card.title
    expect(card_title).to eq('Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.')
  end
end
end