From 0d2cf3cd4a73ffcf0dfba24ea38be2e36528a4b7 Mon Sep 17 00:00:00 2001 From: Claire Date: Mon, 7 Feb 2022 13:14:48 +0100 Subject: [PATCH 1/7] Fix errors when multiple Delete are received for a given actor (#17460) --- app/workers/activitypub/processing_worker.rb | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/app/workers/activitypub/processing_worker.rb b/app/workers/activitypub/processing_worker.rb index cef595319..37e316354 100644 --- a/app/workers/activitypub/processing_worker.rb +++ b/app/workers/activitypub/processing_worker.rb @@ -6,7 +6,10 @@ class ActivityPub::ProcessingWorker sidekiq_options backtrace: true, retry: 8 def perform(account_id, body, delivered_to_account_id = nil) - ActivityPub::ProcessCollectionService.new.call(body, Account.find(account_id), override_timestamps: true, delivered_to_account_id: delivered_to_account_id, delivery: true) + account = Account.find_by(id: account_id) + return if account.nil? + + ActivityPub::ProcessCollectionService.new.call(body, account, override_timestamps: true, delivered_to_account_id: delivered_to_account_id, delivery: true) rescue ActiveRecord::RecordInvalid => e Rails.logger.debug "Error processing incoming ActivityPub object: #{e}" end From 73a782391ca3bc5cbb24fb98065f6a5f4d64f22c Mon Sep 17 00:00:00 2001 From: Claire Date: Mon, 7 Feb 2022 17:06:43 +0100 Subject: [PATCH 2/7] Fix replies collection incorrectly looping (#17462) * Refactor tests * Add tests * Fix replies collection incorrectly looping --- .../activitypub/replies_controller.rb | 30 +- .../activitypub/replies_controller_spec.rb | 351 ++++++++---------- 2 files changed, 180 insertions(+), 201 deletions(-) diff --git a/app/controllers/activitypub/replies_controller.rb b/app/controllers/activitypub/replies_controller.rb index fde6c861f..4ff7cfa08 100644 --- a/app/controllers/activitypub/replies_controller.rb +++ b/app/controllers/activitypub/replies_controller.rb @@ -63,15 +63,29 @@ class ActivityPub::RepliesController < ActivityPub::BaseController end def next_page - only_other_accounts = !(@replies&.last&.account_id == @account.id && @replies.size == DESCENDANTS_LIMIT) + if only_other_accounts? + # Only consider remote accounts + return nil if @replies.size < DESCENDANTS_LIMIT - account_status_replies_url( - @account, - @status, - page: true, - min_id: only_other_accounts && !only_other_accounts? ? nil : @replies&.last&.id, - only_other_accounts: only_other_accounts - ) + account_status_replies_url( + @account, + @status, + page: true, + min_id: @replies&.last&.id, + only_other_accounts: true + ) + else + # For now, we're serving only self-replies, but next page might be other accounts + next_only_other_accounts = @replies&.last&.account_id != @account.id || @replies.size < DESCENDANTS_LIMIT + + account_status_replies_url( + @account, + @status, + page: true, + min_id: next_only_other_accounts ? nil : @replies&.last&.id, + only_other_accounts: next_only_other_accounts + ) + end end def page_params diff --git a/spec/controllers/activitypub/replies_controller_spec.rb b/spec/controllers/activitypub/replies_controller_spec.rb index bf82fd020..a2c7f336f 100644 --- a/spec/controllers/activitypub/replies_controller_spec.rb +++ b/spec/controllers/activitypub/replies_controller_spec.rb @@ -4,8 +4,9 @@ require 'rails_helper' RSpec.describe ActivityPub::RepliesController, type: :controller do let(:status) { Fabricate(:status, visibility: parent_visibility) } - let(:remote_reply_id) { nil } - let(:remote_account) { nil } + let(:remote_account) { Fabricate(:account, domain: 'foobar.com') } + let(:remote_reply_id) { 'https://foobar.com/statuses/1234' } + let(:remote_querier) { nil } shared_examples 'cachable response' do it 'does not set cookies' do @@ -23,8 +24,151 @@ RSpec.describe ActivityPub::RepliesController, type: :controller do end end + shared_examples 'common behavior' do + context 'when status is private' do + let(:parent_visibility) { :private } + + it 'returns http not found' do + expect(response).to have_http_status(404) + end + end + + context 'when status is direct' do + let(:parent_visibility) { :direct } + + it 'returns http not found' do + expect(response).to have_http_status(404) + end + end + end + + shared_examples 'disallowed access' do + context 'when status is public' do + let(:parent_visibility) { :public } + + it 'returns http not found' do + expect(response).to have_http_status(404) + end + end + + it_behaves_like 'common behavior' + end + + shared_examples 'allowed access' do + context 'when account is permanently suspended' do + let(:parent_visibility) { :public } + + before do + status.account.suspend! + status.account.deletion_request.destroy + end + + it 'returns http gone' do + expect(response).to have_http_status(410) + end + end + + context 'when account is temporarily suspended' do + let(:parent_visibility) { :public } + + before do + status.account.suspend! + end + + it 'returns http forbidden' do + expect(response).to have_http_status(403) + end + end + + context 'when status is public' do + let(:parent_visibility) { :public } + let(:json) { body_as_json } + let(:page_json) { json[:first] } + + it 'returns http success' do + expect(response).to have_http_status(200) + end + + it 'returns application/activity+json' do + expect(response.media_type).to eq 'application/activity+json' + end + + it_behaves_like 'cachable response' + + context 'without only_other_accounts' do + it "returns items with thread author's replies" do + expect(page_json).to be_a Hash + expect(page_json[:items]).to be_an Array + expect(page_json[:items].size).to eq 1 + expect(page_json[:items].all? { |item| item[:to].include?(ActivityPub::TagManager::COLLECTIONS[:public]) || item[:cc].include?(ActivityPub::TagManager::COLLECTIONS[:public]) }).to be true + end + + context 'when there are few self-replies' do + it 'points next to replies from other people' do + expect(page_json).to be_a Hash + expect(Addressable::URI.parse(page_json[:next]).query.split('&')).to include('only_other_accounts=true', 'page=true') + end + end + + context 'when there are many self-replies' do + before do + 10.times { Fabricate(:status, account: status.account, thread: status, visibility: :public) } + end + + it 'points next to other self-replies' do + expect(page_json).to be_a Hash + expect(Addressable::URI.parse(page_json[:next]).query.split('&')).to include('only_other_accounts=false', 'page=true') + end + end + end + + context 'with only_other_accounts' do + let(:only_other_accounts) { 'true' } + + it 'returns items with other public or unlisted replies' do + expect(page_json).to be_a Hash + expect(page_json[:items]).to be_an Array + expect(page_json[:items].size).to eq 3 + end + + it 'only inlines items that are local and public or unlisted replies' do + inlined_replies = page_json[:items].select { |x| x.is_a?(Hash) } + public_collection = ActivityPub::TagManager::COLLECTIONS[:public] + expect(inlined_replies.all? { |item| item[:to].include?(public_collection) || item[:cc].include?(public_collection) }).to be true + expect(inlined_replies.all? { |item| ActivityPub::TagManager.instance.local_uri?(item[:id]) }).to be true + end + + it 'uses ids for remote toots' do + remote_replies = page_json[:items].select { |x| !x.is_a?(Hash) } + expect(remote_replies.all? { |item| item.is_a?(String) && !ActivityPub::TagManager.instance.local_uri?(item) }).to be true + end + + context 'when there are few replies' do + it 'does not have a next page' do + expect(page_json).to be_a Hash + expect(page_json[:next]).to be_nil + end + end + + context 'when there are many replies' do + before do + 10.times { Fabricate(:status, thread: status, visibility: :public) } + end + + it 'points next to other replies' do + expect(page_json).to be_a Hash + expect(Addressable::URI.parse(page_json[:next]).query.split('&')).to include('only_other_accounts=true', 'page=true') + end + end + end + end + + it_behaves_like 'common behavior' + end + before do - allow(controller).to receive(:signed_request_account).and_return(remote_account) + stub_const 'ActivityPub::RepliesController::DESCENDANTS_LIMIT', 5 + allow(controller).to receive(:signed_request_account).and_return(remote_querier) Fabricate(:status, thread: status, visibility: :public) Fabricate(:status, thread: status, visibility: :public) @@ -32,215 +176,36 @@ RSpec.describe ActivityPub::RepliesController, type: :controller do Fabricate(:status, account: status.account, thread: status, visibility: :public) Fabricate(:status, account: status.account, thread: status, visibility: :private) - Fabricate(:status, account: remote_account, thread: status, visibility: :public, uri: remote_reply_id) if remote_reply_id + Fabricate(:status, account: remote_account, thread: status, visibility: :public, uri: remote_reply_id) end describe 'GET #index' do + subject(:response) { get :index, params: { account_username: status.account.username, status_id: status.id, only_other_accounts: only_other_accounts } } + let(:only_other_accounts) { nil } + context 'with no signature' do - subject(:response) { get :index, params: { account_username: status.account.username, status_id: status.id } } - subject(:body) { body_as_json } - - context 'when account is permanently suspended' do - let(:parent_visibility) { :public } - - before do - status.account.suspend! - status.account.deletion_request.destroy - end - - it 'returns http gone' do - expect(response).to have_http_status(410) - end - end - - context 'when account is temporarily suspended' do - let(:parent_visibility) { :public } - - before do - status.account.suspend! - end - - it 'returns http forbidden' do - expect(response).to have_http_status(403) - end - end - - context 'when status is public' do - let(:parent_visibility) { :public } - - it 'returns http success' do - expect(response).to have_http_status(200) - end - - it 'returns application/activity+json' do - expect(response.media_type).to eq 'application/activity+json' - end - - it_behaves_like 'cachable response' - - it 'returns items with account\'s own replies' do - expect(body[:first]).to be_a Hash - expect(body[:first][:items]).to be_an Array - expect(body[:first][:items].size).to eq 1 - expect(body[:first][:items].all? { |item| item[:to].include?(ActivityPub::TagManager::COLLECTIONS[:public]) || item[:cc].include?(ActivityPub::TagManager::COLLECTIONS[:public]) }).to be true - end - end - - context 'when status is private' do - let(:parent_visibility) { :private } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end - - context 'when status is direct' do - let(:parent_visibility) { :direct } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end + it_behaves_like 'allowed access' end context 'with signature' do - let(:remote_account) { Fabricate(:account, domain: 'example.com') } - let(:only_other_accounts) { nil } + let(:remote_querier) { Fabricate(:account, domain: 'example.com') } - context do - before do - get :index, params: { account_username: status.account.username, status_id: status.id, only_other_accounts: only_other_accounts } - end - - context 'when status is public' do - let(:parent_visibility) { :public } - - it 'returns http success' do - expect(response).to have_http_status(200) - end - - it 'returns application/activity+json' do - expect(response.media_type).to eq 'application/activity+json' - end - - it_behaves_like 'cachable response' - - context 'without only_other_accounts' do - it 'returns items with account\'s own replies' do - json = body_as_json - - expect(json[:first]).to be_a Hash - expect(json[:first][:items]).to be_an Array - expect(json[:first][:items].size).to eq 1 - expect(json[:first][:items].all? { |item| item[:to].include?(ActivityPub::TagManager::COLLECTIONS[:public]) || item[:cc].include?(ActivityPub::TagManager::COLLECTIONS[:public]) }).to be true - end - end - - context 'with only_other_accounts' do - let(:only_other_accounts) { 'true' } - - it 'returns items with other public or unlisted replies' do - json = body_as_json - - expect(json[:first]).to be_a Hash - expect(json[:first][:items]).to be_an Array - expect(json[:first][:items].size).to eq 2 - expect(json[:first][:items].all? { |item| item[:to].include?(ActivityPub::TagManager::COLLECTIONS[:public]) || item[:cc].include?(ActivityPub::TagManager::COLLECTIONS[:public]) }).to be true - end - - context 'with remote responses' do - let(:remote_reply_id) { 'foo' } - - it 'returned items are all inlined local toots or are ids' do - json = body_as_json - - expect(json[:first]).to be_a Hash - expect(json[:first][:items]).to be_an Array - expect(json[:first][:items].size).to eq 3 - expect(json[:first][:items].all? { |item| item.is_a?(Hash) ? ActivityPub::TagManager.instance.local_uri?(item[:id]) : item.is_a?(String) }).to be true - expect(json[:first][:items]).to include remote_reply_id - end - end - end - end - - context 'when status is private' do - let(:parent_visibility) { :private } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end - - context 'when status is direct' do - let(:parent_visibility) { :direct } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end - end + it_behaves_like 'allowed access' context 'when signed request account is blocked' do before do - status.account.block!(remote_account) - get :index, params: { account_username: status.account.username, status_id: status.id } + status.account.block!(remote_querier) end - context 'when status is public' do - let(:parent_visibility) { :public } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end - - context 'when status is private' do - let(:parent_visibility) { :private } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end - - context 'when status is direct' do - let(:parent_visibility) { :direct } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end + it_behaves_like 'disallowed access' end context 'when signed request account is domain blocked' do before do - status.account.block_domain!(remote_account.domain) - get :index, params: { account_username: status.account.username, status_id: status.id } + status.account.block_domain!(remote_querier.domain) end - context 'when status is public' do - let(:parent_visibility) { :public } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end - - context 'when status is private' do - let(:parent_visibility) { :private } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end - - context 'when status is direct' do - let(:parent_visibility) { :direct } - - it 'returns http not found' do - expect(response).to have_http_status(404) - end - end + it_behaves_like 'disallowed access' end end end From f1f6ddd5362f40e287857750f5e102206bd0e169 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Mon, 7 Feb 2022 18:16:31 +0100 Subject: [PATCH 3/7] Fix structured data parsing from links choking on bad data (#17403) * Fix structured data parsing from links choking on bad data - Fix og:url meta tag being prioritized over canonical link tag - Fix structured data parsing choking on commented-out CDATA declarations - Fix HTML entities in title, description, provider_name, author_name - Change structured data parsing to attempt every JSON-LD script tag * Remove unnecessary slash escapes from CDATA regex pattern --- app/lib/link_details_extractor.rb | 53 ++++++++-- spec/lib/link_details_extractor_spec.rb | 122 ++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 9 deletions(-) diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index 56ad0717b..d2bcf0c25 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -3,6 +3,19 @@ class LinkDetailsExtractor include ActionView::Helpers::TagHelper + # Some publications wrap their JSON-LD data in their + + + HTML + + describe '#title' do + it 'returns the title from structured data' do + expect(subject.title).to eq 'Foo' + end + end + + describe '#description' do + it 'returns the description from structured data' do + expect(subject.description).to eq 'Bar' + end + end + + describe '#provider_name' do + it 'returns the provider name from structured data' do + expect(subject.provider_name).to eq 'Baz' + end + end + + describe '#author_name' do + it 'returns the author name from structured data' do + expect(subject.author_name).to eq 'Hoge' + end + end + end + + context 'but the first tag is invalid JSON' do + let(:html) { <<-HTML } + + + + + + + + HTML + + describe '#title' do + it 'returns the title from structured data' do + expect(subject.title).to eq 'Foo' + end + end + + describe '#description' do + it 'returns the description from structured data' do + expect(subject.description).to eq 'Bar' + end + end + + describe '#provider_name' do + it 'returns the provider name from structured data' do + expect(subject.provider_name).to eq 'Baz' + end + end + + describe '#author_name' do + it 'returns the author name from structured data' do + expect(subject.author_name).to eq 'Hoge' + end + end + end + end end From 52c1b86964caddb99e01ff36e928a524bf66ec0e Mon Sep 17 00:00:00 2001 From: Claire Date: Mon, 7 Feb 2022 19:57:06 +0100 Subject: [PATCH 4/7] Fix Ruby 2.5 incompatibility (#17465) --- app/workers/scheduler/accounts_statuses_cleanup_scheduler.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/workers/scheduler/accounts_statuses_cleanup_scheduler.rb b/app/workers/scheduler/accounts_statuses_cleanup_scheduler.rb index f42d4bca6..7195f0ff9 100644 --- a/app/workers/scheduler/accounts_statuses_cleanup_scheduler.rb +++ b/app/workers/scheduler/accounts_statuses_cleanup_scheduler.rb @@ -66,7 +66,7 @@ class Scheduler::AccountsStatusesCleanupScheduler end def compute_budget - threads = Sidekiq::ProcessSet.new.filter { |x| x['queues'].include?('push') }.map { |x| x['concurrency'] }.sum + threads = Sidekiq::ProcessSet.new.select { |x| x['queues'].include?('push') }.map { |x| x['concurrency'] }.sum [PER_THREAD_BUDGET * threads, MAX_BUDGET].min end From 35850f8195b633c60215461ebde48d2e8725fbd2 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Tue, 8 Feb 2022 01:53:49 +0100 Subject: [PATCH 5/7] Fix localization of cold-start follow recommendations (#17479) --- .../account_suggestions/global_source.rb | 2 +- .../follow_recommendations/show.html.haml | 2 +- .../follow_recommendations_scheduler.rb | 29 +++++++++++-------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/app/models/account_suggestions/global_source.rb b/app/models/account_suggestions/global_source.rb index ac764de50..03ed1b6c2 100644 --- a/app/models/account_suggestions/global_source.rb +++ b/app/models/account_suggestions/global_source.rb @@ -6,7 +6,7 @@ class AccountSuggestions::GlobalSource < AccountSuggestions::Source end def get(account, skip_account_ids: [], limit: 40) - account_ids = account_ids_for_locale(account.user_locale) - [account.id] - skip_account_ids + account_ids = account_ids_for_locale(I18n.locale.to_str.split(/[_-]/).first) - [account.id] - skip_account_ids as_ordered_suggestions( scope(account).where(id: account_ids), diff --git a/app/views/admin/follow_recommendations/show.html.haml b/app/views/admin/follow_recommendations/show.html.haml index 5b949a165..48c1ad1f2 100644 --- a/app/views/admin/follow_recommendations/show.html.haml +++ b/app/views/admin/follow_recommendations/show.html.haml @@ -13,7 +13,7 @@ .filter-subset.filter-subset--with-select %strong= t('admin.follow_recommendations.language') .input.select.optional - = select_tag :language, options_for_select(I18n.available_locales.map { |key| [human_locale(key), key]}, @language) + = select_tag :language, options_for_select(I18n.available_locales.map { |key| key.to_s.split(/[_-]/).first.to_sym }.uniq.map { |key| [human_locale(key), key]}, @language) .filter-subset %strong= t('admin.follow_recommendations.status') diff --git a/app/workers/scheduler/follow_recommendations_scheduler.rb b/app/workers/scheduler/follow_recommendations_scheduler.rb index effc63e59..084619cbd 100644 --- a/app/workers/scheduler/follow_recommendations_scheduler.rb +++ b/app/workers/scheduler/follow_recommendations_scheduler.rb @@ -16,28 +16,33 @@ class Scheduler::FollowRecommendationsScheduler AccountSummary.refresh FollowRecommendation.refresh - fallback_recommendations = FollowRecommendation.order(rank: :desc).limit(SET_SIZE).index_by(&:account_id) + fallback_recommendations = FollowRecommendation.order(rank: :desc).limit(SET_SIZE) - I18n.available_locales.each do |locale| + I18n.available_locales.map { |locale| locale.to_s.split(/[_-]/).first }.uniq.each do |locale| recommendations = begin if AccountSummary.safe.filtered.localized(locale).exists? # We can skip the work if no accounts with that language exist - FollowRecommendation.localized(locale).order(rank: :desc).limit(SET_SIZE).index_by(&:account_id) + FollowRecommendation.localized(locale).order(rank: :desc).limit(SET_SIZE).map { |recommendation| [recommendation.account_id, recommendation.rank] } else - {} + [] end end # Use language-agnostic results if there are not enough language-specific ones - missing = SET_SIZE - recommendations.keys.size + missing = SET_SIZE - recommendations.size + + if missing.positive? && fallback_recommendations.size.positive? + max_fallback_rank = fallback_recommendations.first.rank || 0 + + # Language-specific results should be above language-agnostic ones, + # otherwise language-agnostic ones will always overshadow them + recommendations.map! { |(account_id, rank)| [account_id, rank + max_fallback_rank] } - if missing.positive? added = 0 - # Avoid duplicate results - fallback_recommendations.each_value do |recommendation| - next if recommendations.key?(recommendation.account_id) + fallback_recommendations.each do |recommendation| + next if recommendations.any? { |(account_id, _)| account_id == recommendation.account_id } - recommendations[recommendation.account_id] = recommendation + recommendations << [recommendation.account_id, recommendation.rank] added += 1 break if added >= missing @@ -47,8 +52,8 @@ class Scheduler::FollowRecommendationsScheduler redis.pipelined do redis.del(key(locale)) - recommendations.each_value do |recommendation| - redis.zadd(key(locale), recommendation.rank, recommendation.account_id) + recommendations.each do |(account_id, rank)| + redis.zadd(key(locale), rank, account_id) end end end From 85b86fe28c62b8c3b34de20a292b158526355ddd Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Tue, 8 Feb 2022 02:34:56 +0100 Subject: [PATCH 6/7] Add global `locale` param (#17464) - Remove the session-based locale stickyness --- app/controllers/concerns/localized.rb | 29 ++++++++++++--------------- config/application.rb | 11 ++++++---- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/app/controllers/concerns/localized.rb b/app/controllers/concerns/localized.rb index fe1142f34..f7b62f09c 100644 --- a/app/controllers/concerns/localized.rb +++ b/app/controllers/concerns/localized.rb @@ -7,27 +7,24 @@ module Localized around_action :set_locale end - def set_locale - locale = current_user.locale if respond_to?(:user_signed_in?) && user_signed_in? - locale ||= session[:locale] ||= default_locale - locale = default_locale unless I18n.available_locales.include?(locale.to_sym) - - I18n.with_locale(locale) do - yield - end + def set_locale(&block) + I18n.with_locale(requested_locale || I18n.default_locale, &block) end private - def default_locale - if ENV['DEFAULT_LOCALE'].present? - I18n.default_locale - else - request_locale || I18n.default_locale - end + def requested_locale + requested_locale_name = available_locale_or_nil(params[:locale]) + requested_locale_name ||= available_locale_or_nil(current_user.locale) if respond_to?(:user_signed_in?) && user_signed_in? + requested_locale_name ||= http_accept_language if ENV['DEFAULT_LOCALE'].blank? + requested_locale_name end - def request_locale - http_accept_language.language_region_compatible_from(I18n.available_locales) + def http_accept_language + HttpAcceptLanguage::Parser.new(request.headers.fetch('Accept-Language')).language_region_compatible_from(I18n.available_locales) if request.headers.key?('Accept-Language') + end + + def available_locale_or_nil(locale_name) + locale_name.to_sym if locale_name.present? && I18n.available_locales.include?(locale_name.to_sym) end end diff --git a/config/application.rb b/config/application.rb index 561722884..c6f775162 100644 --- a/config/application.rb +++ b/config/application.rb @@ -149,10 +149,14 @@ module Mastodon :'zh-TW', ] - config.i18n.default_locale = ENV['DEFAULT_LOCALE']&.to_sym + config.i18n.default_locale = begin + custom_default_locale = ENV['DEFAULT_LOCALE']&.to_sym - unless config.i18n.available_locales.include?(config.i18n.default_locale) - config.i18n.default_locale = :en + if config.i18n.available_locales.include?(custom_default_locale) + custom_default_locale + else + :en + end end # config.paths.add File.join('app', 'api'), glob: File.join('**', '*.rb') @@ -169,7 +173,6 @@ module Mastodon Doorkeeper::Application.send :include, ApplicationExtension Doorkeeper::AccessToken.send :include, AccessTokenExtension Devise::FailureApp.send :include, AbstractController::Callbacks - Devise::FailureApp.send :include, HttpAcceptLanguage::EasyAccess Devise::FailureApp.send :include, Localized end end From b6d7726ecbc833abd00f6a9d36b24d9776cfe623 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Tue, 8 Feb 2022 02:41:17 +0100 Subject: [PATCH 7/7] Remove language detection through cld3 (#17478) * Remove language detection through cld3 * Update app/helpers/languages_helper.rb Co-authored-by: Yamagishi Kazutoshi Co-authored-by: Yamagishi Kazutoshi --- Gemfile | 2 - Gemfile.lock | 5 - app/helpers/languages_helper.rb | 291 +++++++++++++----- app/helpers/settings_helper.rb | 2 +- app/lib/activitypub/activity/create.rb | 6 +- app/lib/language_detector.rb | 101 ------ app/lib/link_details_extractor.rb | 9 +- app/models/user.rb | 4 + .../process_status_update_service.rb | 6 +- app/services/post_status_service.rb | 7 +- app/validators/import_validator.rb | 2 + .../settings/preferences/other/show.html.haml | 4 +- config/locales/en.yml | 2 +- lib/tasks/repo.rake | 5 +- spec/helpers/languages_helper_spec.rb | 6 +- spec/lib/language_detector_spec.rb | 134 -------- 16 files changed, 238 insertions(+), 348 deletions(-) delete mode 100644 app/lib/language_detector.rb delete mode 100644 spec/lib/language_detector_spec.rb diff --git a/Gemfile b/Gemfile index afed1ac94..e869e5f7a 100644 --- a/Gemfile +++ b/Gemfile @@ -29,9 +29,7 @@ gem 'addressable', '~> 2.8' gem 'bootsnap', '~> 1.10.2', require: false gem 'browser' gem 'charlock_holmes', '~> 0.7.7' -gem 'iso-639' gem 'chewy', '~> 7.2' -gem 'cld3', '~> 3.4.4' gem 'devise', '~> 4.8' gem 'devise-two-factor', '~> 4.0' diff --git a/Gemfile.lock b/Gemfile.lock index 5ecddec12..f7dd292dd 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -152,8 +152,6 @@ GEM elasticsearch (>= 7.12.0, < 7.14.0) elasticsearch-dsl chunky_png (1.4.0) - cld3 (3.4.4) - ffi (>= 1.1.0, < 1.16.0) climate_control (0.2.0) coderay (1.1.3) color_diff (0.1) @@ -301,7 +299,6 @@ GEM terminal-table (>= 1.5.1) idn-ruby (0.1.4) ipaddress (0.8.3) - iso-639 (0.3.5) jmespath (1.5.0) json (2.5.1) json-canonicalization (0.3.0) @@ -698,7 +695,6 @@ DEPENDENCIES capybara (~> 3.36) charlock_holmes (~> 0.7.7) chewy (~> 7.2) - cld3 (~> 3.4.4) climate_control (~> 0.2) color_diff (~> 0.1) concurrent-ruby @@ -725,7 +721,6 @@ DEPENDENCIES httplog (~> 1.5.0) i18n-tasks (~> 0.9) idn-ruby - iso-639 json-ld json-ld-preloaded (~> 3.2) kaminari (~> 1.2) diff --git a/app/helpers/languages_helper.rb b/app/helpers/languages_helper.rb index 730724208..f3ed7b314 100644 --- a/app/helpers/languages_helper.rb +++ b/app/helpers/languages_helper.rb @@ -1,94 +1,237 @@ # frozen_string_literal: true module LanguagesHelper - HUMAN_LOCALES = { - af: 'Afrikaans', - ar: 'العربية', - ast: 'Asturianu', - bg: 'Български', - bn: 'বাংলা', - br: 'Breton', - ca: 'Català', - co: 'Corsu', - cs: 'Čeština', - cy: 'Cymraeg', - da: 'Dansk', - de: 'Deutsch', - el: 'Ελληνικά', - en: 'English', - eo: 'Esperanto', + ISO_639_1 = { + aa: ['Afar', 'Afaraf'].freeze, + ab: ['Abkhaz', 'аҧсуа бызшәа'].freeze, + ae: ['Avestan', 'avesta'].freeze, + af: ['Afrikaans', 'Afrikaans'].freeze, + ak: ['Akan', 'Akan'].freeze, + am: ['Amharic', 'አማርኛ'].freeze, + an: ['Aragonese', 'aragonés'].freeze, + ar: ['Arabic', 'اللغة العربية'].freeze, + as: ['Assamese', 'অসমীয়া'].freeze, + av: ['Avaric', 'авар мацӀ'].freeze, + ay: ['Aymara', 'aymar aru'].freeze, + az: ['Azerbaijani', 'azərbaycan dili'].freeze, + ba: ['Bashkir', 'башҡорт теле'].freeze, + be: ['Belarusian', 'беларуская мова'].freeze, + bg: ['Bulgarian', 'български език'].freeze, + bh: ['Bihari', 'भोजपुरी'].freeze, + bi: ['Bislama', 'Bislama'].freeze, + bm: ['Bambara', 'bamanankan'].freeze, + bn: ['Bengali', 'বাংলা'].freeze, + bo: ['Tibetan', 'བོད་ཡིག'].freeze, + br: ['Breton', 'brezhoneg'].freeze, + bs: ['Bosnian', 'bosanski jezik'].freeze, + ca: ['Catalan', 'Català'].freeze, + ce: ['Chechen', 'нохчийн мотт'].freeze, + ch: ['Chamorro', 'Chamoru'].freeze, + co: ['Corsican', 'corsu'].freeze, + cr: ['Cree', 'ᓀᐦᐃᔭᐍᐏᐣ'].freeze, + cs: ['Czech', 'čeština'].freeze, + cu: ['Old Church Slavonic', 'ѩзыкъ словѣньскъ'].freeze, + cv: ['Chuvash', 'чӑваш чӗлхи'].freeze, + cy: ['Welsh', 'Cymraeg'].freeze, + da: ['Danish', 'dansk'].freeze, + de: ['German', 'Deutsch'].freeze, + dv: ['Divehi', 'Dhivehi'].freeze, + dz: ['Dzongkha', 'རྫོང་ཁ'].freeze, + ee: ['Ewe', 'Eʋegbe'].freeze, + el: ['Greek', 'Ελληνικά'].freeze, + en: ['English', 'English'].freeze, + eo: ['Esperanto', 'Esperanto'].freeze, + es: ['Spanish', 'Español'].freeze, + et: ['Estonian', 'eesti'].freeze, + eu: ['Basque', 'euskara'].freeze, + fa: ['Persian', 'فارسی'].freeze, + ff: ['Fula', 'Fulfulde'].freeze, + fi: ['Finnish', 'suomi'].freeze, + fj: ['Fijian', 'Vakaviti'].freeze, + fo: ['Faroese', 'føroyskt'].freeze, + fr: ['French', 'Français'].freeze, + fy: ['Western Frisian', 'Frysk'].freeze, + ga: ['Irish', 'Gaeilge'].freeze, + gd: ['Scottish Gaelic', 'Gàidhlig'].freeze, + gl: ['Galician', 'galego'].freeze, + gu: ['Gujarati', 'ગુજરાતી'].freeze, + gv: ['Manx', 'Gaelg'].freeze, + ha: ['Hausa', 'هَوُسَ'].freeze, + he: ['Hebrew', 'עברית'].freeze, + hi: ['Hindi', 'हिन्दी'].freeze, + ho: ['Hiri Motu', 'Hiri Motu'].freeze, + hr: ['Croatian', 'Hrvatski'].freeze, + ht: ['Haitian', 'Kreyòl ayisyen'].freeze, + hu: ['Hungarian', 'magyar'].freeze, + hy: ['Armenian', 'Հայերեն'].freeze, + hz: ['Herero', 'Otjiherero'].freeze, + ia: ['Interlingua', 'Interlingua'].freeze, + id: ['Indonesian', 'Bahasa Indonesia'].freeze, + ie: ['Interlingue', 'Interlingue'].freeze, + ig: ['Igbo', 'Asụsụ Igbo'].freeze, + ii: ['Nuosu', 'ꆈꌠ꒿ Nuosuhxop'].freeze, + ik: ['Inupiaq', 'Iñupiaq'].freeze, + io: ['Ido', 'Ido'].freeze, + is: ['Icelandic', 'Íslenska'].freeze, + it: ['Italian', 'Italiano'].freeze, + iu: ['Inuktitut', 'ᐃᓄᒃᑎᑐᑦ'].freeze, + ja: ['Japanese', '日本語'].freeze, + jv: ['Javanese', 'basa Jawa'].freeze, + ka: ['Georgian', 'ქართული'].freeze, + kg: ['Kongo', 'Kikongo'].freeze, + ki: ['Kikuyu', 'Gĩkũyũ'].freeze, + kj: ['Kwanyama', 'Kuanyama'].freeze, + kk: ['Kazakh', 'қазақ тілі'].freeze, + kl: ['Kalaallisut', 'kalaallisut'].freeze, + km: ['Khmer', 'ខេមរភាសា'].freeze, + kn: ['Kannada', 'ಕನ್ನಡ'].freeze, + ko: ['Korean', '한국어'].freeze, + kr: ['Kanuri', 'Kanuri'].freeze, + ks: ['Kashmiri', 'कश्मीरी'].freeze, + ku: ['Kurdish', 'Kurdî'].freeze, + kv: ['Komi', 'коми кыв'].freeze, + kw: ['Cornish', 'Kernewek'].freeze, + ky: ['Kyrgyz', 'Кыргызча'].freeze, + la: ['Latin', 'latine'].freeze, + lb: ['Luxembourgish', 'Lëtzebuergesch'].freeze, + lg: ['Ganda', 'Luganda'].freeze, + li: ['Limburgish', 'Limburgs'].freeze, + ln: ['Lingala', 'Lingála'].freeze, + lo: ['Lao', 'ພາສາ'].freeze, + lt: ['Lithuanian', 'lietuvių kalba'].freeze, + lu: ['Luba-Katanga', 'Tshiluba'].freeze, + lv: ['Latvian', 'latviešu valoda'].freeze, + mg: ['Malagasy', 'fiteny malagasy'].freeze, + mh: ['Marshallese', 'Kajin M̧ajeļ'].freeze, + mi: ['Māori', 'te reo Māori'].freeze, + mk: ['Macedonian', 'македонски јазик'].freeze, + ml: ['Malayalam', 'മലയാളം'].freeze, + mn: ['Mongolian', 'Монгол хэл'].freeze, + mr: ['Marathi', 'मराठी'].freeze, + ms: ['Malay', 'Bahasa Malaysia'].freeze, + mt: ['Maltese', 'Malti'].freeze, + my: ['Burmese', 'ဗမာစာ'].freeze, + na: ['Nauru', 'Ekakairũ Naoero'].freeze, + nb: ['Norwegian Bokmål', 'Norsk bokmål'].freeze, + nd: ['Northern Ndebele', 'isiNdebele'].freeze, + ne: ['Nepali', 'नेपाली'].freeze, + ng: ['Ndonga', 'Owambo'].freeze, + nl: ['Dutch', 'Nederlands'].freeze, + nn: ['Norwegian Nynorsk', 'Norsk nynorsk'].freeze, + no: ['Norwegian', 'Norsk'].freeze, + nr: ['Southern Ndebele', 'isiNdebele'].freeze, + nv: ['Navajo', 'Diné bizaad'].freeze, + ny: ['Chichewa', 'chiCheŵa'].freeze, + oc: ['Occitan', 'occitan'].freeze, + oj: ['Ojibwe', 'ᐊᓂᔑᓈᐯᒧᐎᓐ'].freeze, + om: ['Oromo', 'Afaan Oromoo'].freeze, + or: ['Oriya', 'ଓଡ଼ିଆ'].freeze, + os: ['Ossetian', 'ирон æвзаг'].freeze, + pa: ['Panjabi', 'ਪੰਜਾਬੀ'].freeze, + pi: ['Pāli', 'पाऴि'].freeze, + pl: ['Polish', 'Polski'].freeze, + ps: ['Pashto', 'پښتو'].freeze, + pt: ['Portuguese', 'Português'].freeze, + qu: ['Quechua', 'Runa Simi'].freeze, + rm: ['Romansh', 'rumantsch grischun'].freeze, + rn: ['Kirundi', 'Ikirundi'].freeze, + ro: ['Romanian', 'Română'].freeze, + ru: ['Russian', 'Русский'].freeze, + rw: ['Kinyarwanda', 'Ikinyarwanda'].freeze, + sa: ['Sanskrit', 'संस्कृतम्'].freeze, + sc: ['Sardinian', 'sardu'].freeze, + sd: ['Sindhi', 'सिन्धी'].freeze, + se: ['Northern Sami', 'Davvisámegiella'].freeze, + sg: ['Sango', 'yângâ tî sängö'].freeze, + si: ['Sinhala', 'සිංහල'].freeze, + sk: ['Slovak', 'slovenčina'].freeze, + sl: ['Slovenian', 'slovenščina'].freeze, + sn: ['Shona', 'chiShona'].freeze, + so: ['Somali', 'Soomaaliga'].freeze, + sq: ['Albanian', 'Shqip'].freeze, + sr: ['Serbian', 'српски језик'].freeze, + ss: ['Swati', 'SiSwati'].freeze, + st: ['Southern Sotho', 'Sesotho'].freeze, + su: ['Sundanese', 'Basa Sunda'].freeze, + sv: ['Swedish', 'Svenska'].freeze, + sw: ['Swahili', 'Kiswahili'].freeze, + ta: ['Tamil', 'தமிழ்'].freeze, + te: ['Telugu', 'తెలుగు'].freeze, + tg: ['Tajik', 'тоҷикӣ'].freeze, + th: ['Thai', 'ไทย'].freeze, + ti: ['Tigrinya', 'ትግርኛ'].freeze, + tk: ['Turkmen', 'Türkmen'].freeze, + tl: ['Tagalog', 'Wikang Tagalog'].freeze, + tn: ['Tswana', 'Setswana'].freeze, + to: ['Tonga', 'faka Tonga'].freeze, + tr: ['Turkish', 'Türkçe'].freeze, + ts: ['Tsonga', 'Xitsonga'].freeze, + tt: ['Tatar', 'татар теле'].freeze, + tw: ['Twi', 'Twi'].freeze, + ty: ['Tahitian', 'Reo Tahiti'].freeze, + ug: ['Uyghur', 'ئۇيغۇرچە‎'].freeze, + uk: ['Ukrainian', 'Українська'].freeze, + ur: ['Urdu', 'اردو'].freeze, + uz: ['Uzbek', 'Ўзбек'].freeze, + ve: ['Venda', 'Tshivenḓa'].freeze, + vi: ['Vietnamese', 'Tiếng Việt'].freeze, + vo: ['Volapük', 'Volapük'].freeze, + wa: ['Walloon', 'walon'].freeze, + wo: ['Wolof', 'Wollof'].freeze, + xh: ['Xhosa', 'isiXhosa'].freeze, + yi: ['Yiddish', 'ייִדיש'].freeze, + yo: ['Yoruba', 'Yorùbá'].freeze, + za: ['Zhuang', 'Saɯ cueŋƅ'].freeze, + zh: ['Chinese', '中文'].freeze, + zu: ['Zulu', 'isiZulu'].freeze, + }.freeze + + ISO_639_3 = { + ast: ['Asturian', 'Asturianu'].freeze, + kab: ['Kabyle', 'Taqbaylit'].freeze, + kmr: ['Northern Kurdish', 'Kurmancî'].freeze, + zgh: ['Standard Moroccan Tamazight', 'ⵜⴰⵎⴰⵣⵉⵖⵜ'].freeze, + }.freeze + + SUPPORTED_LOCALES = {}.merge(ISO_639_1).merge(ISO_639_3).freeze + + # For ISO-639-1 and ISO-639-3 language codes, we have their official + # names, but for some translations, we need the names of the + # regional variants specifically + REGIONAL_LOCALE_NAMES = { 'es-AR': 'Español (Argentina)', 'es-MX': 'Español (México)', - es: 'Español', - et: 'Eesti', - eu: 'Euskara', - fa: 'فارسی', - fi: 'Suomi', - fr: 'Français', - ga: 'Gaeilge', - gd: 'Gàidhlig', - gl: 'Galego', - he: 'עברית', - hi: 'हिन्दी', - hr: 'Hrvatski', - hu: 'Magyar', - hy: 'Հայերեն', - id: 'Bahasa Indonesia', - io: 'Ido', - is: 'Íslenska', - it: 'Italiano', - ja: '日本語', - ka: 'ქართული', - kab: 'Taqbaylit', - kk: 'Қазақша', - kmr: 'Kurmancî', - kn: 'ಕನ್ನಡ', - ko: '한국어', - ku: 'سۆرانی', - lt: 'Lietuvių', - lv: 'Latviešu', - mk: 'Македонски', - ml: 'മലയാളം', - mr: 'मराठी', - ms: 'Bahasa Melayu', - nl: 'Nederlands', - nn: 'Nynorsk', - no: 'Norsk', - oc: 'Occitan', - pl: 'Polski', 'pt-BR': 'Português (Brasil)', 'pt-PT': 'Português (Portugal)', - pt: 'Português', - ro: 'Română', - ru: 'Русский', - sa: 'संस्कृतम्', - sc: 'Sardu', - si: 'සිංහල', - sk: 'Slovenčina', - sl: 'Slovenščina', - sq: 'Shqip', 'sr-Latn': 'Srpski (latinica)', - sr: 'Српски', - sv: 'Svenska', - ta: 'தமிழ்', - te: 'తెలుగు', - th: 'ไทย', - tr: 'Türkçe', - uk: 'Українська', - ur: 'اُردُو', - vi: 'Tiếng Việt', - zgh: 'ⵜⴰⵎⴰⵣⵉⵖⵜ', 'zh-CN': '简体中文', 'zh-HK': '繁體中文(香港)', 'zh-TW': '繁體中文(臺灣)', - zh: '中文', }.freeze def human_locale(locale) if locale == 'und' I18n.t('generic.none') + elsif (supported_locale = SUPPORTED_LOCALES[locale.to_sym]) + supported_locale[1] + elsif (regional_locale = REGIONAL_LOCALE_NAMES[locale.to_sym]) + regional_locale else - HUMAN_LOCALES[locale.to_sym] || locale + locale end end + + def valid_locale_or_nil(str) + return if str.blank? + + code, = str.to_s.split(/[_-]/) # Strip out the region from e.g. en_US or ja-JP + + return unless valid_locale?(code) + + code + end + + def valid_locale?(locale) + SUPPORTED_LOCALES.key?(locale.to_sym) + end end diff --git a/app/helpers/settings_helper.rb b/app/helpers/settings_helper.rb index 23739d1cd..3d5592867 100644 --- a/app/helpers/settings_helper.rb +++ b/app/helpers/settings_helper.rb @@ -2,7 +2,7 @@ module SettingsHelper def filterable_languages - LanguageDetector.instance.language_names.select(&LanguagesHelper::HUMAN_LOCALES.method(:key?)) + LanguagesHelper::SUPPORTED_LOCALES.keys end def hash_to_object(hash) diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb index 33998c477..ea8d146d4 100644 --- a/app/lib/activitypub/activity/create.rb +++ b/app/lib/activitypub/activity/create.rb @@ -112,7 +112,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity url: @status_parser.url || @status_parser.uri, account: @account, text: converted_object_type? ? converted_text : (@status_parser.text || ''), - language: @status_parser.language || detected_language, + language: @status_parser.language, spoiler_text: converted_object_type? ? '' : (@status_parser.spoiler_text || ''), created_at: @status_parser.created_at, edited_at: @status_parser.edited_at, @@ -370,10 +370,6 @@ class ActivityPub::Activity::Create < ActivityPub::Activity Formatter.instance.linkify([@status_parser.title.presence, @status_parser.spoiler_text.presence, @status_parser.url || @status_parser.uri].compact.join("\n\n")) end - def detected_language - LanguageDetector.instance.detect(@status_parser.text, @account) if supported_object_type? - end - def unsupported_media_type?(mime_type) mime_type.present? && !MediaAttachment.supported_mime_types.include?(mime_type) end diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb deleted file mode 100644 index 40452eddc..000000000 --- a/app/lib/language_detector.rb +++ /dev/null @@ -1,101 +0,0 @@ -# frozen_string_literal: true - -class LanguageDetector - include Singleton - - WORDS_THRESHOLD = 4 - RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\p{Thai}]+/m - - def initialize - @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048) - end - - def detect(text, account) - input_text = prepare_text(text) - - return if input_text.blank? - - detect_language_code(input_text) || default_locale(account) - end - - def language_names - @language_names = CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq - end - - private - - def prepare_text(text) - simplify_text(text).strip - end - - def unreliable_input?(text) - !reliable_input?(text) - end - - def reliable_input?(text) - sufficient_text_length?(text) || language_specific_character_set?(text) - end - - def sufficient_text_length?(text) - text.split(/\s+/).size >= WORDS_THRESHOLD - end - - def language_specific_character_set?(text) - words = text.scan(RELIABLE_CHARACTERS_RE) - - if words.present? - words.reduce(0) { |acc, elem| acc + elem.size }.to_f / text.size > 0.3 - else - false - end - end - - def detect_language_code(text) - return if unreliable_input?(text) - - result = @identifier.find_language(text) - - iso6391(result.language.to_s).to_sym if result&.reliable? - end - - def iso6391(bcp47) - iso639 = bcp47.split('-').first - - # CLD3 returns grandfathered language code for Hebrew - return 'he' if iso639 == 'iw' - - ISO_639.find(iso639).alpha2 - end - - def simplify_text(text) - new_text = remove_html(text) - new_text.gsub!(FetchLinkCardService::URL_PATTERN, '\1') - new_text.gsub!(Account::MENTION_RE, '') - new_text.gsub!(Tag::HASHTAG_RE) { |string| string.gsub(/[#_]/, '#' => '', '_' => ' ').gsub(/[a-z][A-Z]|[a-zA-Z][\d]/) { |s| s.insert(1, ' ') }.downcase } - new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '') - new_text.gsub!(/\s+/, ' ') - new_text - end - - def new_scrubber - scrubber = Rails::Html::PermitScrubber.new - scrubber.tags = %w(br p) - scrubber - end - - def scrubber - @scrubber ||= new_scrubber - end - - def remove_html(text) - text = Loofah.fragment(text).scrub!(scrubber).to_s - text.gsub!('
', "\n") - text.gsub!('

', "\n\n") - text.gsub!(/(^

|<\/p>$)/, '') - text - end - - def default_locale(account) - account.user_locale&.to_sym || I18n.default_locale if account.local? - end -end diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index d2bcf0c25..fabbd244d 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -2,6 +2,7 @@ class LinkDetailsExtractor include ActionView::Helpers::TagHelper + include LanguagesHelper # Some publications wrap their JSON-LD data in their