BigW Consortium Gitlab

Commit 030c0c53 by Lin Jen-Shin (godfat)

Merge branch 'bvl-circuitbreaker-patch' into '10-1-stable'

Circuitbreaker stable patch See merge request gitlab-org/gitlab-ce!15079
parents 6c86a586 e32712e3
......@@ -108,6 +108,43 @@ module ApplicationSettingsHelper
options_for_select(Sidekiq::Queue.all.map(&:name), @application_setting.sidekiq_throttling_queues)
end
def circuitbreaker_failure_count_help_text
health_link = link_to(s_('AdminHealthPageLink|health page'), admin_health_check_path)
api_link = link_to(s_('CircuitBreakerApiLink|circuitbreaker api'), help_page_path("api/repository_storage_health"))
message = _("The number of failures of after which GitLab will completely "\
"prevent access to the storage. The number of failures can be "\
"reset in the admin interface: %{link_to_health_page} or using "\
"the %{api_documentation_link}.")
message = message % { link_to_health_page: health_link, api_documentation_link: api_link }
message.html_safe
end
def circuitbreaker_access_retries_help_text
_('The number of attempts GitLab will make to access a storage.')
end
def circuitbreaker_backoff_threshold_help_text
_("The number of failures after which GitLab will start temporarily "\
"disabling access to a storage shard on a host")
end
def circuitbreaker_failure_wait_time_help_text
_("When access to a storage fails. GitLab will prevent access to the "\
"storage for the time specified here. This allows the filesystem to "\
"recover. Repositories on failing shards are temporarly unavailable")
end
def circuitbreaker_failure_reset_time_help_text
_("The time in seconds GitLab will keep failure information. When no "\
"failures occur during this time, information about the mount is reset.")
end
def circuitbreaker_storage_timeout_help_text
_("The time in seconds GitLab will try to access storage. After this time a "\
"timeout error will be raised.")
end
def visible_attributes
[
:admin_notification_email,
......@@ -116,6 +153,12 @@ module ApplicationSettingsHelper
:akismet_api_key,
:akismet_enabled,
:auto_devops_enabled,
:circuitbreaker_access_retries,
:circuitbreaker_backoff_threshold,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_failure_wait_time,
:circuitbreaker_storage_timeout,
:clientside_sentry_dsn,
:clientside_sentry_enabled,
:container_registry_token_expire_delay,
......
......@@ -16,17 +16,16 @@ module StorageHealthHelper
def message_for_circuit_breaker(circuit_breaker)
maximum_failures = circuit_breaker.failure_count_threshold
current_failures = circuit_breaker.failure_count
permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures
translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures,
number_of_seconds: circuit_breaker.failure_wait_time }
if permanently_broken
if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
elsif circuit_breaker.circuit_broken?
elsif circuit_breaker.backing_off?
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"block access for %{number_of_seconds} seconds.") % translation_params
else
......
......@@ -153,6 +153,25 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { greater_than_or_equal_to: 0 }
validates :circuitbreaker_backoff_threshold,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_wait_time,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
validates :circuitbreaker_access_retries,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 1 }
validates_each :circuitbreaker_backoff_threshold do |record, attr, value|
if value.to_i >= record.circuitbreaker_failure_count_threshold
record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\
"lower than the failure count threshold"))
end
end
SUPPORTED_KEY_TYPES.each do |type|
validates :"#{type}_key_restriction", presence: true, key_restriction: { type: type }
end
......
......@@ -530,6 +530,44 @@
= succeed "." do
= link_to "repository storages documentation", help_page_path("administration/repository_storages")
%fieldset
%legend Git Storage Circuitbreaker settings
.form-group
= f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_access_retries, class: 'form-control'
.help-block
= circuitbreaker_access_retries_help_text
.form-group
= f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
.help-block
= circuitbreaker_storage_timeout_help_text
.form-group
= f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_backoff_threshold, class: 'form-control'
.help-block
= circuitbreaker_backoff_threshold_help_text
.form-group
= f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_wait_time, class: 'form-control'
.help-block
= circuitbreaker_failure_wait_time_help_text
.form-group
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'
.help-block
= circuitbreaker_failure_count_help_text
.form-group
= f.label :circuitbreaker_failure_reset_time, _('Seconds before reseting failure information'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_reset_time, class: 'form-control'
.help-block
= circuitbreaker_failure_reset_time_help_text
%fieldset
%legend Repository Checks
......
---
title: Make the circuitbreaker more robust by adding higher thresholds, and multiple
access attempts.
merge_request: 14933
author:
type: fixed
---
title: Store circuitbreaker settings in the database instead of config
merge_request: 14842
author:
type: changed
......@@ -522,11 +522,6 @@ production: &base
path: /home/git/repositories/
gitaly_address: unix:/home/git/gitlab/tmp/sockets/private/gitaly.socket # TCP connections are supported too (e.g. tcp://host:port)
# gitaly_token: 'special token' # Optional: override global gitaly.token for this storage.
failure_count_threshold: 10 # number of failures before stopping attempts
failure_wait_time: 30 # Seconds after an access failure before allowing access again
failure_reset_time: 1800 # Time in seconds to expire failures
storage_timeout: 30 # Time in seconds to wait before aborting a storage access attempt
## Backup settings
backup:
......@@ -659,9 +654,6 @@ test:
default:
path: tmp/tests/repositories/
gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
failure_count_threshold: 999999
failure_wait_time: 0
storage_timeout: 30
broken:
path: tmp/tests/non-existent-repositories
gitaly_address: unix:tmp/tests/gitaly/gitaly.socket
......
......@@ -455,17 +455,6 @@ Settings.repositories.storages.each do |key, storage|
# Expand relative paths
storage['path'] = Settings.absolute(storage['path'])
# Set failure defaults
storage['failure_count_threshold'] ||= 10
storage['failure_wait_time'] ||= 30
storage['failure_reset_time'] ||= 1800
storage['storage_timeout'] ||= 5
# Set turn strings into numbers
storage['failure_count_threshold'] = storage['failure_count_threshold'].to_i
storage['failure_wait_time'] = storage['failure_wait_time'].to_i
storage['failure_reset_time'] = storage['failure_reset_time'].to_i
# We might want to have a timeout shorter than 1 second.
storage['storage_timeout'] = storage['storage_timeout'].to_f
Settings.repositories.storages[key] = storage
end
......
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class AddCircuitBreakerPropertiesToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def change
add_column :application_settings,
:circuitbreaker_failure_count_threshold,
:integer,
default: 160
add_column :application_settings,
:circuitbreaker_failure_wait_time,
:integer,
default: 30
add_column :application_settings,
:circuitbreaker_failure_reset_time,
:integer,
default: 1800
add_column :application_settings,
:circuitbreaker_storage_timeout,
:integer,
default: 30
end
end
class AddNewCircuitbreakerSettingsToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def change
add_column :application_settings,
:circuitbreaker_access_retries,
:integer,
default: 3
add_column :application_settings,
:circuitbreaker_backoff_threshold,
:integer,
default: 80
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171006091000) do
ActiveRecord::Schema.define(version: 20171017145932) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -134,6 +134,12 @@ ActiveRecord::Schema.define(version: 20171006091000) do
t.boolean "hashed_storage_enabled", default: false, null: false
t.boolean "project_export_enabled", default: true, null: false
t.boolean "auto_devops_enabled", default: false, null: false
t.integer "circuitbreaker_failure_count_threshold", default: 160
t.integer "circuitbreaker_failure_wait_time", default: 30
t.integer "circuitbreaker_failure_reset_time", default: 1800
t.integer "circuitbreaker_storage_timeout", default: 30
t.integer "circuitbreaker_access_retries", default: 3
t.integer "circuitbreaker_backoff_threshold", default: 80
end
create_table "audit_events", force: :cascade do |t|
......
......@@ -105,61 +105,40 @@ When GitLab detects access to the repositories storage fails repeatedly, it can
gracefully prevent attempts to access the storage. This might be useful when
the repositories are stored somewhere on the network.
The configuration could look as follows:
This can be configured from the admin interface:
**For Omnibus installations**
1. Edit `/etc/gitlab/gitlab.rb`:
![circuitbreaker configuration](img/circuitbreaker_config.png)
```ruby
git_data_dirs({
"default" => {
"path" => "/mnt/nfs-01/git-data",
"failure_count_threshold" => 10,
"failure_wait_time" => 30,
"failure_reset_time" => 1800,
"storage_timeout" => 5
}
})
```
1. Save the file and [reconfigure GitLab][reconfigure-gitlab] for the changes to take effect.
---
**For installations from source**
**Number of access attempts**: The number of attempts GitLab will make to access a
storage when probing a shard.
1. Edit `config/gitlab.yml`:
**Number of failures before backing off**: The number of failures after which
GitLab will start temporarily disabling access to a storage shard on a host.
```yaml
repositories:
storages: # You must have at least a `default` storage path.
default:
path: /home/git/repositories/
failure_count_threshold: 10 # number of failures before stopping attempts
failure_wait_time: 30 # Seconds after last access failure before trying again
failure_reset_time: 1800 # Time in seconds to expire failures
storage_timeout: 5 # Time in seconds to wait before aborting a storage access attempt
```
1. Save the file and [restart GitLab][restart-gitlab] for the changes to take effect.
**`failure_count_threshold`:** The number of failures of after which GitLab will
**Maximum git storage failures:** The number of failures of after which GitLab will
completely prevent access to the storage. The number of failures can be reset in
the admin interface: `https://gitlab.example.com/admin/health_check` or using the
[api](../api/repository_storage_health.md) to allow access to the storage again.
**`failure_wait_time`:** When access to a storage fails. GitLab will prevent
access to the storage for the time specified here. This allows the filesystem to
recover without.
**Seconds to wait after a storage failure:** When access to a storage fails. GitLab
will prevent access to the storage for the time specified here. This allows the
filesystem to recover.
**`failure_reset_time`:** The time in seconds GitLab will keep failure
information. When no failures occur during this time, information about the
**Seconds before reseting failure information:** The time in seconds GitLab will
keep failure information. When no failures occur during this time, information about the
mount is reset.
**`storage_timeout`:** The time in seconds GitLab will try to access storage.
After this time a timeout error will be raised.
**Seconds to wait for a storage access attempt:** The time in seconds GitLab will
try to access storage. After this time a timeout error will be raised.
To enable the circuitbreaker for repository storage you can flip the feature flag from a rails console:
```
Feature.enable('git_storage_circuit_breaker')
```
Alternatively it can be enabled by setting `true` in the `GIT_STORAGE_CIRCUIT_BREAKER` environment variable.
This approach would be used when enabling the circuit breaker on a single host.
When storage failures occur, this will be visible in the admin interface like this:
......
......@@ -96,6 +96,12 @@ PUT /application/settings
| `dsa_key_restriction` | integer | no | The minimum allowed bit length of an uploaded DSA key. Default is `0` (no restriction). `-1` disables DSA keys.
| `ecdsa_key_restriction` | integer | no | The minimum allowed curve size (in bits) of an uploaded ECDSA key. Default is `0` (no restriction). `-1` disables ECDSA keys.
| `ed25519_key_restriction` | integer | no | The minimum allowed curve size (in bits) of an uploaded ED25519 key. Default is `0` (no restriction). `-1` disables ED25519 keys.
| `circuitbreaker_access_retries | integer | no | The number of attempts GitLab will make to access a storage. |
| `circuitbreaker_backoff_threshold | integer | no | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host. |
| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures of after which GitLab will completely prevent access to the storage. |
| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. |
| `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt |
```bash
curl --request PUT --header "PRIVATE-TOKEN: 9koXpg98eAheJpvBs5tK" https://gitlab.example.com/api/v4/application/settings?signup_enabled=false&default_project_visibility=internal
......
......@@ -12,6 +12,7 @@ module Gitlab
CircuitOpen = Class.new(Inaccessible)
Misconfiguration = Class.new(Inaccessible)
Failing = Class.new(Inaccessible)
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
......
......@@ -2,15 +2,13 @@ module Gitlab
module Git
module Storage
class CircuitBreaker
include CircuitBreakerSettings
FailureInfo = Struct.new(:last_failure, :failure_count)
attr_reader :storage,
:hostname,
:storage_path,
:failure_count_threshold,
:failure_wait_time,
:failure_reset_time,
:storage_timeout
:storage_path
delegate :last_failure, :failure_count, to: :failure_info
......@@ -53,14 +51,10 @@ module Gitlab
config = Gitlab.config.repositories.storages[@storage]
@storage_path = config['path']
@failure_count_threshold = config['failure_count_threshold']
@failure_wait_time = config['failure_wait_time']
@failure_reset_time = config['failure_reset_time']
@storage_timeout = config['storage_timeout']
end
def perform
return yield unless Feature.enabled?('git_storage_circuit_breaker')
return yield unless enabled?
check_storage_accessible!
......@@ -70,10 +64,27 @@ module Gitlab
def circuit_broken?
return false if no_failures?
failure_count > failure_count_threshold
end
def backing_off?
return false if no_failures?
recent_failure = last_failure > failure_wait_time.seconds.ago
too_many_failures = failure_count > failure_count_threshold
too_many_failures = failure_count > backoff_threshold
recent_failure || too_many_failures
recent_failure && too_many_failures
end
private
# The circuitbreaker can be enabled for the entire fleet using a Feature
# flag.
#
# Enabling it for a single host can be done setting the
# `GIT_STORAGE_CIRCUIT_BREAKER` environment variable.
def enabled?
ENV['GIT_STORAGE_CIRCUIT_BREAKER'].present? || Feature.enabled?('git_storage_circuit_breaker')
end
def failure_info
......@@ -89,7 +100,7 @@ module Gitlab
return @storage_available if @storage_available
if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
.storage_available?(storage_path, storage_timeout)
.storage_available?(storage_path, storage_timeout, access_retries)
track_storage_accessible
else
track_storage_inaccessible
......@@ -100,7 +111,11 @@ module Gitlab
def check_storage_accessible!
if circuit_broken?
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time)
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
end
if backing_off?
raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
end
unless storage_available?
......@@ -137,12 +152,6 @@ module Gitlab
end
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
private
def get_failure_info
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :last_failure, :failure_count)
......@@ -152,6 +161,10 @@ module Gitlab
FailureInfo.new(last_failure, failure_count.to_i)
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
end
end
end
......
module Gitlab
module Git
module Storage
module CircuitBreakerSettings
def failure_count_threshold
application_settings.circuitbreaker_failure_count_threshold
end
def failure_wait_time
application_settings.circuitbreaker_failure_wait_time
end
def failure_reset_time
application_settings.circuitbreaker_failure_reset_time
end
def storage_timeout
application_settings.circuitbreaker_storage_timeout
end
def access_retries
application_settings.circuitbreaker_access_retries
end
def backoff_threshold
application_settings.circuitbreaker_backoff_threshold
end
private
def application_settings
Gitlab::CurrentSettings.current_application_settings
end
end
end
end
end
......@@ -4,8 +4,17 @@ module Gitlab
module ForkedStorageCheck
extend self
def storage_available?(path, timeout_seconds = 5)
status = timeout_check(path, timeout_seconds)
def storage_available?(path, timeout_seconds = 5, retries = 1)
partial_timeout = timeout_seconds / retries
status = timeout_check(path, partial_timeout)
# If the status check did not succeed the first time, we retry a few
# more times to avoid one-off failures
current_attempts = 1
while current_attempts < retries && !status.success?
status = timeout_check(path, partial_timeout)
current_attempts += 1
end
status.success?
end
......
......@@ -2,15 +2,14 @@ module Gitlab
module Git
module Storage
class NullCircuitBreaker
include CircuitBreakerSettings
# These will have actual values
attr_reader :storage,
:hostname
# These will always have nil values
attr_reader :storage_path,
:failure_wait_time,
:failure_reset_time,
:storage_timeout
attr_reader :storage_path
def initialize(storage, hostname, error: nil)
@storage = storage
......@@ -26,8 +25,8 @@ module Gitlab
!!@error
end
def failure_count_threshold
1
def backing_off?
false
end
def last_failure
......@@ -35,7 +34,7 @@ module Gitlab
end
def failure_count
circuit_broken? ? 1 : 0
circuit_broken? ? failure_count_threshold : 0
end
def failure_info
......
......@@ -65,9 +65,11 @@ feature "Admin Health Check", :feature, :broken_storage do
it 'shows storage failure information' do
hostname = Gitlab::Environment.hostname
maximum_failures = Gitlab::CurrentSettings.current_application_settings
.circuitbreaker_failure_count_threshold
expect(page).to have_content('broken: failed storage access attempt on host:')
expect(page).to have_content("#{hostname}: 1 of 10 failures.")
expect(page).to have_content("#{hostname}: 1 of #{maximum_failures} failures.")
end
it 'allows resetting storage failures' do
......
......@@ -18,26 +18,6 @@ describe Settings do
end
end
describe '#repositories' do
it 'assigns the default failure attributes' do
repository_settings = Gitlab.config.repositories.storages['broken']
expect(repository_settings['failure_count_threshold']).to eq(10)
expect(repository_settings['failure_wait_time']).to eq(30)
expect(repository_settings['failure_reset_time']).to eq(1800)
expect(repository_settings['storage_timeout']).to eq(5)
end
it 'can be accessed with dot syntax all the way down' do
expect(Gitlab.config.repositories.storages.broken.failure_count_threshold).to eq(10)
end
it 'can be accessed in a very specific way that breaks without reassigning each element with Settingslogic' do
storage_settings = Gitlab.config.repositories.storages['broken']
expect(storage_settings.failure_count_threshold).to eq(10)
end
end
describe '#host_without_www' do
context 'URL with protocol' do
it 'returns the host' do
......
......@@ -10,18 +10,10 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
# Override test-settings for the circuitbreaker with something more realistic
# for these specs.
stub_storage_settings('default' => {
'path' => TestEnv.repos_path,
'failure_count_threshold' => 10,
'failure_wait_time' => 30,
'failure_reset_time' => 1800,
'storage_timeout' => 5
'path' => TestEnv.repos_path
},
'broken' => {
'path' => 'tmp/tests/non-existent-repositories',
'failure_count_threshold' => 10,
'failure_wait_time' => 30,
'failure_reset_time' => 1800,
'storage_timeout' => 5
'path' => 'tmp/tests/non-existent-repositories'
},
'nopath' => { 'path' => nil }
)
......@@ -79,241 +71,222 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.hostname).to eq(hostname)
expect(circuit_breaker.storage).to eq('default')
expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path)
expect(circuit_breaker.failure_count_threshold).to eq(10)
expect(circuit_breaker.failure_wait_time).to eq(30)
expect(circuit_breaker.failure_reset_time).to eq(1800)
expect(circuit_breaker.storage_timeout).to eq(5)
end
end
describe '#perform' do
it 'raises an exception with retry time when the circuit is open' do
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
expect { |b| circuit_breaker.perform(&b) }
.to raise_error(Gitlab::Git::Storage::CircuitOpen)
context 'circuitbreaker settings' do
before do
stub_application_setting(circuitbreaker_failure_count_threshold: 0,
circuitbreaker_failure_wait_time: 1,
circuitbreaker_failure_reset_time: 2,
circuitbreaker_storage_timeout: 3,
circuitbreaker_access_retries: 4,
circuitbreaker_backoff_threshold: 5)
end
it 'yields the block' do
expect { |b| circuit_breaker.perform(&b) }
.to yield_control
describe '#failure_count_threshold' do
it 'reads the value from settings' do
expect(circuit_breaker.failure_count_threshold).to eq(0)
end
it 'checks if the storage is available' do
expect(circuit_breaker).to receive(:check_storage_accessible!)
circuit_breaker.perform { 'hello world' }
end
it 'returns the value of the block' do
result = circuit_breaker.perform { 'return value' }
expect(result).to eq('return value')
describe '#failure_wait_time' do
it 'reads the value from settings' do
expect(circuit_breaker.failure_wait_time).to eq(1)
end
it 'raises possible errors' do
expect { circuit_breaker.perform { raise Rugged::OSError.new('Broken') } }
.to raise_error(Rugged::OSError)
end
context 'with the feature disabled' do
it 'returns the block without checking accessibility' do
stub_feature_flags(git_storage_circuit_breaker: false)
describe '#failure_reset_time' do
it 'reads the value from settings' do
expect(circuit_breaker.failure_reset_time).to eq(2)
end
end
expect(circuit_breaker).not_to receive(:circuit_broken?)
describe '#storage_timeout' do
it 'reads the value from settings' do
expect(circuit_breaker.storage_timeout).to eq(3)
end
end
result = circuit_breaker.perform { 'hello' }
describe '#access_retries' do
it 'reads the value from settings' do
expect(circuit_breaker.access_retries).to eq(4)
end
end
expect(result).to eq('hello')
describe '#backoff_threshold' do
it 'reads the value from settings' do
expect(circuit_breaker.backoff_threshold).to eq(5)
end
end
end
describe '#circuit_broken?' do
it 'is working when there is no last failure' do
set_in_redis(:last_failure, nil)
set_in_redis(:failure_count, 0)
describe '#perform' do
it 'raises the correct exception when the circuit is open' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 999)
expect(circuit_breaker.circuit_broken?).to be_falsey
expect { |b| circuit_breaker.perform(&b) }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
expect(exception.retry_after).to eq(1800)
end
end
it 'is broken when there was a recent failure' do
it 'raises the correct exception when backing off' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 1)
set_in_redis(:failure_count, 90)
expect(circuit_breaker.circuit_broken?).to be_truthy
expect { |b| circuit_breaker.perform(&b) }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing)
expect(exception.retry_after).to eq(30)
end
end
it 'is broken when there are too many failures' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 200)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
context 'the `failure_wait_time` is set to 0' do
before do
stub_storage_settings('default' => {
'failure_wait_time' => 0,
'path' => TestEnv.repos_path
})
it 'yields the block' do
expect { |b| circuit_breaker.perform(&b) }
.to yield_control
end
it 'is working even when there is a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 0.seconds.ago.to_f)
set_in_redis(:failure_count, 1)
it 'checks if the storage is available' do
expect(circuit_breaker).to receive(:check_storage_accessible!)
.and_call_original
expect(circuit_breaker.circuit_broken?).to be_falsey
end
circuit_breaker.perform { 'hello world' }
end
it 'returns the value of the block' do
result = circuit_breaker.perform { 'return value' }
expect(result).to eq('return value')
end
it 'raises possible errors' do
expect { circuit_breaker.perform { raise Rugged::OSError.new('Broken') } }
.to raise_error(Rugged::OSError)
end
describe "storage_available?" do
context 'the storage is available' do
it 'tracks that the storage was accessible an raises the error' do
expect(circuit_breaker).to receive(:track_storage_accessible)
it 'tracks that the storage was accessible' do
set_in_redis(:failure_count, 10)
set_in_redis(:last_failure, Time.now.to_f)
circuit_breaker.perform { '' }
circuit_breaker.storage_available?
expect(value_from_redis(:failure_count).to_i).to eq(0)
expect(value_from_redis(:last_failure)).to be_empty
expect(circuit_breaker.failure_count).to eq(0)
expect(circuit_breaker.last_failure).to be_nil
end
it 'only performs the check once' do
it 'only performs the accessibility check once' do
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).once.and_call_original
2.times { circuit_breaker.storage_available? }
end
2.times { circuit_breaker.perform { '' } }
end
context 'storage is not available' do
let(:storage_name) { 'broken' }
it 'calls the check with the correct arguments' do
stub_application_setting(circuitbreaker_storage_timeout: 30,
circuitbreaker_access_retries: 3)
it 'tracks that the storage was inaccessible' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).with(TestEnv.repos_path, 30, 3)
.and_call_original
circuit_breaker.storage_available?
circuit_breaker.perform { '' }
end
context 'with the feature disabled' do
before do
stub_feature_flags(git_storage_circuit_breaker: false)
end
it 'returns the block without checking accessibility' do
expect(circuit_breaker).not_to receive(:check_storage_accessible!)
result = circuit_breaker.perform { 'hello' }
expect(result).to eq('hello')
end
describe '#check_storage_accessible!' do
it 'raises an exception with retry time when the circuit is open' do
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
it 'allows enabling the feature using an ENV var' do
stub_env('GIT_STORAGE_CIRCUIT_BREAKER', 'true')
expect(circuit_breaker).to receive(:check_storage_accessible!)
expect { circuit_breaker.check_storage_accessible! }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
expect(exception.retry_after).to eq(30)
result = circuit_breaker.perform { 'hello' }
expect(result).to eq('hello')
end
end
context 'the storage is not available' do
let(:storage_name) { 'broken' }
it 'raises an error' do
it 'raises the correct exception' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
expect { circuit_breaker.check_storage_accessible! }
expect { circuit_breaker.perform { '' } }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
expect(exception.retry_after).to eq(30)
end
end
end
end
describe '#track_storage_inaccessible' do
around do |example|
Timecop.freeze { example.run }
end
it 'records the failure time in redis' do
circuit_breaker.track_storage_inaccessible
failure_time = value_from_redis(:last_failure)
expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now)
end
it 'sets the failure time on the breaker without reloading' do
circuit_breaker.track_storage_inaccessible
it 'tracks that the storage was inaccessible' do
Timecop.freeze do
expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible)
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.last_failure).to eq(Time.now)
expect(value_from_redis(:failure_count).to_i).to eq(1)
expect(value_from_redis(:last_failure)).not_to be_empty
expect(circuit_breaker.failure_count).to eq(1)
expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now)
end
it 'increments the failure count in redis' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_inaccessible
expect(value_from_redis(:failure_count).to_i).to be(11)
end
it 'increments the failure count on the breaker without reloading' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_inaccessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.failure_count).to eq(11)
end
end
describe '#track_storage_accessible' do
it 'sets the failure count to zero in redis' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_accessible
describe '#circuit_broken?' do
it 'is working when there is no last failure' do
set_in_redis(:last_failure, nil)
set_in_redis(:failure_count, 0)
expect(value_from_redis(:failure_count).to_i).to be(0)
expect(circuit_breaker.circuit_broken?).to be_falsey
end
it 'sets the failure count to zero on the breaker without reloading' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_accessible
it 'is broken when there are too many failures' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 200)
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.failure_count).to eq(0)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
it 'removes the last failure time from redis' do
set_in_redis(:last_failure, Time.now.to_i)
circuit_breaker.track_storage_accessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.last_failure).to be_nil
end
it 'removes the last failure time from the breaker without reloading' do
set_in_redis(:last_failure, Time.now.to_i)
describe '#backing_off?' do
it 'is true when there was a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 90)
circuit_breaker.track_storage_accessible
expect(circuit_breaker.backing_off?).to be_truthy
end
end
expect(value_from_redis(:last_failure)).to be_empty
context 'the `failure_wait_time` is set to 0' do
before do
stub_application_setting(circuitbreaker_failure_wait_time: 0)
end
it 'wont connect to redis when there are no failures' do
expect(Gitlab::Git::Storage.redis).to receive(:with).once
.and_call_original
expect(circuit_breaker).to receive(:track_storage_accessible)
.and_call_original
it 'is working even when there are failures' do
Timecop.freeze do
set_in_redis(:last_failure, 0.seconds.ago.to_f)
set_in_redis(:failure_count, 90)
circuit_breaker.track_storage_accessible
expect(circuit_breaker.backing_off?).to be_falsey
end
end
describe '#no_failures?' do
it 'is false when a failure was tracked' do
set_in_redis(:last_failure, Time.now.to_i)
set_in_redis(:failure_count, 1)
expect(circuit_breaker.no_failures?).to be_falsey
end
end
......@@ -333,10 +306,4 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.failure_count).to eq(7)
end
end
describe '#cache_key' do
it 'includes storage and host' do
expect(circuit_breaker.cache_key).to eq(cache_key)
end
end
end
......@@ -33,6 +33,21 @@ describe Gitlab::Git::Storage::ForkedStorageCheck, broken_storage: true, skip_da
expect(runtime).to be < 1.0
end
it 'will try the specified amount of times before failing' do
allow(described_class).to receive(:check_filesystem_in_process) do
Process.spawn("sleep 10")
end
expect(Process).to receive(:spawn).with('sleep 10').twice
.and_call_original
runtime = Benchmark.realtime do
described_class.storage_available?(existing_path, 0.5, 2)
end
expect(runtime).to be < 1.0
end
describe 'when using paths with spaces' do
let(:test_dir) { Rails.root.join('tmp', 'tests', 'storage_check') }
let(:path_with_spaces) { File.join(test_dir, 'path with spaces') }
......
......@@ -54,6 +54,10 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
end
describe '#failure_count_threshold' do
before do
stub_application_setting(circuitbreaker_failure_count_threshold: 1)
end
it { expect(breaker.failure_count_threshold).to eq(1) }
end
......@@ -61,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
ours = described_class.public_instance_methods
theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods
# These methods are not part of the public API, but are public to allow the
# CircuitBreaker specs to operate. They should be made private over time.
exceptions = %i[
cache_key
check_storage_accessible!
no_failures?
storage_available?
track_storage_accessible
track_storage_inaccessible
]
expect(theirs - ours).to contain_exactly(*exceptions)
expect(theirs - ours).to be_empty
end
end
......@@ -114,6 +114,30 @@ describe ApplicationSetting do
it { expect(setting.repository_storages).to eq(['default']) }
end
context 'circuitbreaker settings' do
[:circuitbreaker_backoff_threshold,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_wait_time,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout].each do |field|
it "Validates #{field} as number" do
is_expected.to validate_numericality_of(field)
.only_integer
.is_greater_than_or_equal_to(0)
end
end
it 'requires the `backoff_threshold` to be lower than the `failure_count_threshold`' do
setting.circuitbreaker_failure_count_threshold = 10
setting.circuitbreaker_backoff_threshold = 15
failure_message = "The circuitbreaker backoff threshold should be lower "\
"than the failure count threshold"
expect(setting).not_to be_valid
expect(setting.errors[:circuitbreaker_backoff_threshold]).to include(failure_message)
end
end
context 'repository storages' do
before do
storages = {
......
......@@ -23,6 +23,7 @@ describe API::Settings, 'Settings' do
expect(json_response['dsa_key_restriction']).to eq(0)
expect(json_response['ecdsa_key_restriction']).to eq(0)
expect(json_response['ed25519_key_restriction']).to eq(0)
expect(json_response['circuitbreaker_failure_count_threshold']).not_to be_nil
end
end
......@@ -52,7 +53,8 @@ describe API::Settings, 'Settings' do
rsa_key_restriction: ApplicationSetting::FORBIDDEN_KEY_VALUE,
dsa_key_restriction: 2048,
ecdsa_key_restriction: 384,
ed25519_key_restriction: 256
ed25519_key_restriction: 256,
circuitbreaker_failure_wait_time: 2
expect(response).to have_http_status(200)
expect(json_response['default_projects_limit']).to eq(3)
......@@ -73,6 +75,7 @@ describe API::Settings, 'Settings' do
expect(json_response['dsa_key_restriction']).to eq(2048)
expect(json_response['ecdsa_key_restriction']).to eq(384)
expect(json_response['ed25519_key_restriction']).to eq(256)
expect(json_response['circuitbreaker_failure_wait_time']).to eq(2)
end
end
......
......@@ -43,10 +43,6 @@ module StubConfiguration
messages['default'] ||= Gitlab.config.repositories.storages.default
messages.each do |storage_name, storage_settings|
storage_settings['path'] = TestEnv.repos_path unless storage_settings.key?('path')
storage_settings['failure_count_threshold'] ||= 10
storage_settings['failure_wait_time'] ||= 30
storage_settings['failure_reset_time'] ||= 1800
storage_settings['storage_timeout'] ||= 5
end
allow(Gitlab.config.repositories).to receive(:storages).and_return(Settingslogic.new(messages))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment