From ad207456d64c76d21c17b26a954b459fe2dc0f54 Mon Sep 17 00:00:00 2001
From: "Renato \"Lond\" Cerqueira"
Date: Thu, 16 Nov 2017 10:51:38 -0200
Subject: [PATCH] Improve language filter (#5724)
* Scrub text of html before detecting language.
* Detect language on statuses coming from activitypub.
* Fix rubocop comments.
* Remove custom emoji from text before language detection
---
app/lib/activitypub/activity/create.rb | 2 +-
app/lib/language_detector.rb | 31 +++++++++++++++++++++-----
2 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb
index 376684c006..66e4f7c5ec 100644
--- a/app/lib/activitypub/activity/create.rb
+++ b/app/lib/activitypub/activity/create.rb
@@ -173,7 +173,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
end
def language_from_content
- return nil unless language_map?
+ return LanguageDetector.instance.detect(text_from_content, @account) unless language_map?
@object['contentMap'].keys.first
end
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index a42460e109..c6f52f0c7e 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -38,12 +38,31 @@ class LanguageDetector
end
def simplify_text(text)
- text.dup.tap do |new_text|
- new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
- new_text.gsub!(Account::MENTION_RE, '')
- new_text.gsub!(Tag::HASHTAG_RE, '')
- new_text.gsub!(/\s+/, ' ')
- end
+ new_text = remove_html(text)
+ new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
+ new_text.gsub!(Account::MENTION_RE, '')
+ new_text.gsub!(Tag::HASHTAG_RE, '')
+ new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '')
+ new_text.gsub!(/\s+/, ' ')
+ new_text
+ end
+
+ def new_scrubber
+ scrubber = Rails::Html::PermitScrubber.new
+ scrubber.tags = %w(br p)
+ scrubber
+ end
+
+ def scrubber
+ @scrubber ||= new_scrubber
+ end
+
+ def remove_html(text)
+ text = Loofah.fragment(text).scrub!(scrubber).to_s
+ text.gsub!('
', "\n")
+ text.gsub!('
', "\n\n")
+ text.gsub!(/(^
|<\/p>$)/, '')
+ text
end
def default_locale(account)