BigW Consortium Gitlab

Commit 9216f59a by Douwe Maan

Merge branch '24240-prometheus_healthz' into 'master'

Add /-/readiness, /-/liveness and /-/health_metrics endpoints to track application readiness. Closes #24240. See merge request !10416.
parents 15e87cea c3e43c9b
# Controller concern that guards every action behind the health check
# access token. The token may be supplied either as the `token` request
# parameter or in the `TOKEN` request header; requests without a matching
# token receive the static 404 page.
module RequiresHealthToken
  extend ActiveSupport::Concern

  included do
    before_action :validate_health_check_access!
  end

  private

  # before_action filter: answers with a 404 unless the request token is valid.
  def validate_health_check_access!
    return if token_valid?

    render_404
  end

  # Returns true when a non-blank token was supplied and it matches the
  # configured health check access token.
  def token_valid?
    supplied = params[:token].presence || request.headers['TOKEN']
    return false unless supplied.present?

    expected = current_application_settings.health_check_access_token
    ActiveSupport::SecurityUtils.variable_size_secure_compare(supplied, expected)
  end

  # Renders public/404 without a layout and with a 404 status.
  def render_404
    not_found_page = Rails.root.join('public', '404')
    render file: not_found_page, layout: false, status: '404'
  end
end
# Token-protected wrapper around the health_check controller.
#
# The access check (the `validate_health_check_access!` before_action plus
# the `token_valid?` and `render_404` helpers) comes from the
# RequiresHealthToken concern, so it is not re-declared here.
class HealthCheckController < HealthCheck::HealthCheckController
  include RequiresHealthToken
end
# Serves the readiness, liveness and metrics endpoints. Every action is
# guarded by the health check access token (see RequiresHealthToken).
class HealthController < ActionController::Base
  protect_from_forgery with: :exception
  include RequiresHealthToken

  # Checks that are run, in this order, by every endpoint.
  CHECKS = [
    Gitlab::HealthChecks::DbCheck,
    Gitlab::HealthChecks::RedisCheck,
    Gitlab::HealthChecks::FsShardsCheck
  ].freeze

  # Renders the readiness result of every check as JSON.
  def readiness
    results = CHECKS.map { |check| [check.name, check.readiness] }
    render_check_results(results)
  end

  # Renders the liveness result of every check as JSON.
  def liveness
    results = CHECKS.map { |check| [check.name, check.liveness] }
    render_check_results(results)
  end

  # Renders the metrics of every check, one sample per line.
  def metrics
    lines = CHECKS.flat_map(&:metrics).map { |metric| metric_to_prom_line(metric) }
    # Every line, including the last one, is terminated by a line feed.
    body = lines.map { |line| "#{line}\n" }.join

    render plain: body, content_type: 'text/plain; version=0.0.4'
  end

  private

  # Formats one metric as `name{label="value",...} value`, or `name value`
  # when the metric carries no labels.
  def metric_to_prom_line(metric)
    label_pairs = (metric.labels || {}).map do |key, value|
      %(#{key}="#{escape_label_value(value)}")
    end
    return "#{metric.name} #{metric.value}" if label_pairs.empty?

    "#{metric.name}{#{label_pairs.join(',')}} #{metric.value}"
  end

  # Escapes backslash, double quote and line feed so that a label value
  # cannot break out of its quoted position.
  def escape_label_value(value)
    value.to_s.gsub(/[\\"\n]/) { |char| char == "\n" ? '\n' : "\\#{char}" }
  end

  # Renders check results as a JSON object keyed by check name.
  #
  # results - Array of [name, result] pairs where result is either a single
  #           Gitlab::HealthChecks::Result or a collection of them.
  #
  # Responds with 200 when every result succeeded, 503 otherwise.
  def render_check_results(results)
    flattened = results.flat_map do |name, result|
      if result.is_a?(Gitlab::HealthChecks::Result)
        [[name, result]]
      else
        result.map { |single| [name, single] }
      end
    end

    success = flattened.all? { |_name, result| result.success }

    payload = flattened.map do |name, result|
      info = { status: result.success ? 'ok' : 'failed' }
      info[:message] = result.message if result.message
      info[:labels] = result.labels if result.labels
      [name, info]
    end

    # NOTE(review): `to_h` keeps only the last entry per name, so a check
    # that returns several results shows just one of them in the body; the
    # status code still reflects all results.
    render json: payload.to_h, status: success ? :ok : :service_unavailable
  end
end
---
title: Add /-/readiness /-/liveness and /-/metrics endpoints to track application health
merge_request: 10416
author:
......@@ -39,6 +39,12 @@ Rails.application.routes.draw do
# Health check
get 'health_check(/:checks)' => 'health_check#index', as: :health_check
# Health endpoints: GET liveness, readiness and metrics under the '-' path
# scope, all handled by the 'health' controller.
scope path: '-', controller: 'health' do
get :liveness
get :readiness
get :metrics
end
# Koding route
get 'koding' => 'koding#index'
......
module Gitlab
  module HealthChecks
    # Shared behaviour for health checks. Meant to be mixed into a check
    # class with `extend`, so these become class-level methods.
    module BaseAbstractCheck
      # Short identifier derived from the class name: demodulized and
      # underscored.
      def name
        super.demodulize.underscore
      end

      # Capitalized name with a trailing "_check" removed.
      def human_name
        name.sub(/_check$/, '').capitalize
      end

      # Must be implemented by the concrete check.
      #
      # Raises NotImplementedError.
      def readiness
        raise NotImplementedError
      end

      # Default liveness: a successful result.
      def liveness
        HealthChecks::Result.new(true)
      end

      # Default metrics: none.
      def metrics
        []
      end

      protected

      # Builds a Metric from a name, a value and label keyword arguments.
      def metric(name, value, **labels)
        Metric.new(name, value, labels)
      end

      # Calls `proc` and yields its result together with the elapsed time
      # in seconds (Float). Returns whatever the block returns.
      #
      # The monotonic clock is used so that wall-clock adjustments cannot
      # produce negative or inflated durations.
      def with_timing(proc)
        started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        result = proc.call
        yield result, Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
      end

      # Runs the block under a timeout of `seconds` (converted with `to_i`).
      #
      # Returns the block's value, or the Timeout::Error instance (returned,
      # not raised) when the timeout expires.
      def catch_timeout(seconds, &block)
        Timeout.timeout(seconds.to_i, &block)
      rescue Timeout::Error => ex
        ex
      end
    end
  end
end
module Gitlab
  module HealthChecks
    # Database health check: runs `SELECT 1 as ping` and treats the string
    # '1' as success.
    class DbCheck
      extend SimpleAbstractCheck

      class << self
        private

        # Prefix used for the names of the emitted metrics.
        def metric_prefix
          'db_ping'
        end

        # A check result counts as successful when it equals '1'.
        def is_successful?(result)
          result == '1'
        end

        # Executes the ping query under a 10 second timeout and extracts the
        # ping value from the first row: the 'ping' column on PostgreSQL,
        # otherwise the first column converted to a String.
        def check
          catch_timeout(10.seconds) do
            postgres = Gitlab::Database.postgresql?
            first_row = ActiveRecord::Base.connection.execute('SELECT 1 as ping')&.first

            postgres ? first_row&.[]('ping') : first_row&.first&.to_s
          end
        end
      end
    end
  end
end
module Gitlab
  module HealthChecks
    # Health check for the repository storage shards. For every configured
    # storage it verifies that the storage path can be stat-ed, written to
    # and read from, using a temporary probe file that is removed afterwards.
    class FsShardsCheck
      extend BaseAbstractCheck

      class << self
        # Returns one HealthChecks::Result per repository storage, labelled
        # with the shard name.
        def readiness
          repository_storages.map do |storage_name|
            begin
              probe_path = tmp_file_path(storage_name)

              if !storage_stat_test(storage_name)
                HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
              elsif !storage_write_test(probe_path)
                HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
              elsif !storage_read_test(probe_path)
                HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
              else
                HealthChecks::Result.new(true, nil, shard: storage_name)
              end
            rescue RuntimeError => ex
              message = "unexpected error #{ex} when checking storage #{storage_name}"
              Rails.logger.error(message)
              HealthChecks::Result.new(false, message, shard: storage_name)
            ensure
              delete_test_file(probe_path) if probe_path
            end
          end
        end

        # Returns a flat list of metrics (an ok flag and a latency for the
        # stat, write and read operations) for every repository storage.
        def metrics
          repository_storages.flat_map do |storage_name|
            begin
              probe_path = tmp_file_path(storage_name)

              [
                operation_metrics(:filesystem_accessible, :filesystem_access_latency, -> { storage_stat_test(storage_name) }, shard: storage_name),
                operation_metrics(:filesystem_writable, :filesystem_write_latency, -> { storage_write_test(probe_path) }, shard: storage_name),
                operation_metrics(:filesystem_readable, :filesystem_read_latency, -> { storage_read_test(probe_path) }, shard: storage_name)
              ].flatten
            ensure
              # The write probe leaves a file behind; remove it so repeated
              # metric collection does not accumulate files on the storage.
              delete_test_file(probe_path) if probe_path
            end
          end
        end

        private

        # Payload written to and compared against the probe file.
        RANDOM_STRING = SecureRandom.hex(1000).freeze

        # Runs `operation` and returns its latency metric plus an ok metric
        # (1 when the operation returned a truthy value, 0 otherwise). On a
        # RuntimeError only a failed ok metric is returned.
        def operation_metrics(ok_metric, latency_metric, operation, **labels)
          with_timing operation do |result, elapsed|
            [
              metric(latency_metric, elapsed, **labels),
              metric(ok_metric, result ? 1 : 0, **labels)
            ]
          end
        rescue RuntimeError => ex
          Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
          [metric(ok_metric, 0, **labels)]
        end

        # Names of the storages to check.
        # NOTE(review): memoized at class level, so later changes to the
        # application settings are presumably not picked up - confirm.
        def repository_storages
          @repository_storage ||= Gitlab::CurrentSettings.current_application_settings.repository_storages
        end

        # Storage configuration, keyed by storage name.
        def storages_paths
          @storage_paths ||= Gitlab.config.repositories.storages
        end

        # Prefixes a command with `timeout 1`.
        def with_timeout(args)
          %w{timeout 1}.concat(args)
        end

        # Generates a temporary file name inside the storage directory.
        def tmp_file_path(storage_name)
          Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path }
        end

        # Configured filesystem path of the given storage, or nil.
        def path(storage_name)
          storages_paths&.dig(storage_name, 'path')
        end

        # True when the storage directory can be stat-ed. Falls back to pure
        # Ruby file checks when the external command cannot be found.
        def storage_stat_test(storage_name)
          stat_path = File.join(path(storage_name), '.')
          _, status = Gitlab::Popen.popen(with_timeout(%W{ stat #{stat_path} }))
          status == 0
        rescue Errno::ENOENT
          File.exist?(stat_path) && File::Stat.new(stat_path).readable?
        end

        # True when RANDOM_STRING could be written to tmp_path. Falls back
        # to File.write when the external command cannot be found; any error
        # in the fallback counts as a failed write.
        def storage_write_test(tmp_path)
          _, status = Gitlab::Popen.popen(with_timeout(%W{ tee #{tmp_path} })) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status == 0
        rescue Errno::ENOENT
          begin
            File.write(tmp_path, RANDOM_STRING) == RANDOM_STRING.length
          rescue StandardError
            false
          end
        end

        # True when tmp_path contains exactly RANDOM_STRING. Falls back to
        # File.read when the external command cannot be found; any error in
        # the fallback counts as a failed read.
        def storage_read_test(tmp_path)
          _, status = Gitlab::Popen.popen(with_timeout(%W{ diff #{tmp_path} - })) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status == 0
        rescue Errno::ENOENT
          begin
            File.read(tmp_path) == RANDOM_STRING
          rescue StandardError
            false
          end
        end

        # Best-effort removal of the probe file; never raises from the
        # fallback path. Returns true when the removal succeeded.
        def delete_test_file(tmp_path)
          _, status = Gitlab::Popen.popen(with_timeout(%W{ rm -f #{tmp_path} }))
          status == 0
        rescue Errno::ENOENT
          begin
            File.delete(tmp_path)
            true
          rescue StandardError
            false
          end
        end
      end
    end
  end
end
module Gitlab::HealthChecks
# A single metric sample with three positional members: name, value and
# labels.
Metric = Struct.new(:name, :value, :labels)
end
module Gitlab
  module HealthChecks
    # Redis health check: sends a ping and treats the reply 'PONG' as
    # success.
    class RedisCheck
      extend SimpleAbstractCheck

      class << self
        private

        # Prefix used for the names of the emitted metrics.
        def metric_prefix
          'redis_ping'
        end

        # A check result counts as successful when it equals 'PONG'.
        def is_successful?(result)
          result == 'PONG'
        end

        # Pings Redis under a 10 second timeout.
        def check
          catch_timeout(10.seconds) do
            Gitlab::Redis.with { |redis| redis.ping }
          end
        end
      end
    end
  end
end
module Gitlab::HealthChecks
# Outcome of a health check with three positional members: success,
# message and labels.
Result = Struct.new(:success, :message, :labels)
end
module Gitlab
  module HealthChecks
    # Template for checks that consist of a single operation. Implementers
    # provide `metric_prefix`, `is_successful?` and `check`; this module
    # derives the readiness result and the metrics from them.
    module SimpleAbstractCheck
      include BaseAbstractCheck

      # Runs the check once and maps its outcome to a HealthChecks::Result:
      # success, timeout, or unexpected result.
      def readiness
        outcome = check
        return HealthChecks::Result.new(true) if is_successful?(outcome)

        failure_message =
          if outcome.is_a?(Timeout::Error)
            "#{human_name} check timed out"
          else
            "unexpected #{human_name} check result: #{outcome}"
          end

        HealthChecks::Result.new(false, failure_message)
      end

      # Runs the check once, timing it, and returns the timeout, success
      # and latency metrics. An unsuccessful outcome is also logged.
      def metrics
        with_timing(method(:check)) do |outcome, duration|
          unless is_successful?(outcome)
            Rails.logger.error("#{human_name} check returned unexpected result #{outcome}")
          end

          timeout_metric = metric("#{metric_prefix}_timeout", outcome.is_a?(Timeout::Error) ? 1 : 0)
          success_metric = metric("#{metric_prefix}_success", is_successful?(outcome) ? 1 : 0)
          latency_metric = metric("#{metric_prefix}_latency", duration)

          [timeout_metric, success_metric, latency_metric]
        end
      end

      private

      # Prefix for the metric names; supplied by the concrete check.
      def metric_prefix
        raise NotImplementedError
      end

      # Whether a check outcome means success; supplied by the concrete check.
      def is_successful?(result)
        raise NotImplementedError
      end

      # Performs the actual check; supplied by the concrete check.
      def check
        raise NotImplementedError
      end
    end
  end
end
require 'spec_helper'
# Controller spec for HealthController: each endpoint is exercised once with
# the access token in the TOKEN header and once without any token.
describe HealthController do
include StubENV
# Token the controller compares the request against.
let(:token) { current_application_settings.health_check_access_token }
let(:json_response) { JSON.parse(response.body) }
before do
# NOTE(review): presumably forces persisted (not in-memory) application
# settings so that a token exists - confirm.
stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'false')
end
describe '#readiness' do
context 'authorization token provided' do
before do
request.headers['TOKEN'] = token
end
it 'returns proper response' do
get :readiness
# The body is keyed by check name; each entry carries a status.
expect(json_response['db_check']['status']).to eq('ok')
expect(json_response['redis_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['labels']['shard']).to eq('default')
end
end
context 'without authorization token' do
it 'returns proper response' do
get :readiness
# Missing token is answered with a 404.
expect(response.status).to eq(404)
end
end
end
describe '#liveness' do
context 'authorization token provided' do
before do
request.headers['TOKEN'] = token
end
it 'returns proper response' do
get :liveness
expect(json_response['db_check']['status']).to eq('ok')
expect(json_response['redis_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['status']).to eq('ok')
end
end
context 'without authorization token' do
it 'returns proper response' do
get :liveness
expect(response.status).to eq(404)
end
end
end
describe '#metrics' do
context 'authorization token provided' do
before do
request.headers['TOKEN'] = token
end
# The metrics body is plain text with one `name value` or
# `name{labels} value` sample per line, hence the line-anchored patterns.
it 'returns DB ping metrics' do
get :metrics
expect(response.body).to match(/^db_ping_timeout 0$/)
expect(response.body).to match(/^db_ping_success 1$/)
expect(response.body).to match(/^db_ping_latency [0-9\.]+$/)
end
it 'returns Redis ping metrics' do
get :metrics
expect(response.body).to match(/^redis_ping_timeout 0$/)
expect(response.body).to match(/^redis_ping_success 1$/)
expect(response.body).to match(/^redis_ping_latency [0-9\.]+$/)
end
it 'returns file system check metrics' do
get :metrics
expect(response.body).to match(/^filesystem_access_latency{shard="default"} [0-9\.]+$/)
expect(response.body).to match(/^filesystem_accessible{shard="default"} 1$/)
expect(response.body).to match(/^filesystem_write_latency{shard="default"} [0-9\.]+$/)
expect(response.body).to match(/^filesystem_writable{shard="default"} 1$/)
expect(response.body).to match(/^filesystem_read_latency{shard="default"} [0-9\.]+$/)
expect(response.body).to match(/^filesystem_readable{shard="default"} 1$/)
end
end
context 'without authorization token' do
it 'returns proper response' do
get :metrics
expect(response.status).to eq(404)
end
end
end
end
require 'spec_helper'
require_relative './simple_check_shared'
# Runs the shared 'simple_check' examples with the metric prefix
# 'db_ping', the human-readable check name 'Db' and the success value '1'.
describe Gitlab::HealthChecks::DbCheck do
include_examples 'simple_check', 'db_ping', 'Db', '1'
end
require 'spec_helper'
# Spec for FsShardsCheck. The same 'filesystem checks' examples are run
# twice: once with Gitlab::Popen calling through, and once with it raising
# Errno::ENOENT so that the pure-Ruby fallbacks are exercised.
describe Gitlab::HealthChecks::FsShardsCheck do
let(:metric_class) { Gitlab::HealthChecks::Metric }
let(:result_class) { Gitlab::HealthChecks::Result }
let(:repository_storages) { [:default] }
let(:tmp_dir) { Dir.mktmpdir }
let(:storages_paths) do
{
default: { path: tmp_dir }
}.with_indifferent_access
end
before do
# Replace the configuration lookups with the values defined above.
allow(described_class).to receive(:repository_storages) { repository_storages }
allow(described_class).to receive(:storages_paths) { storages_paths }
end
after do
FileUtils.remove_entry_secure(tmp_dir) if Dir.exist?(tmp_dir)
end
shared_examples 'filesystem checks' do
describe '#readiness' do
subject { described_class.readiness }
context 'storage points to not existing folder' do
let(:storages_paths) do
{
default: { path: 'tmp/this/path/doesnt/exist' }
}.with_indifferent_access
end
it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: :default)) }
end
context 'storage points to directory that has both read and write rights' do
before do
FileUtils.chmod_R(0755, tmp_dir)
end
it { is_expected.to include(result_class.new(true, nil, shard: :default)) }
it 'cleans up files used for testing' do
expect(described_class).to receive(:storage_write_test).with(any_args).and_call_original
subject
# Only '.' and '..' remain once the probe file has been deleted.
expect(Dir.entries(tmp_dir).count).to eq(2)
end
context 'read test fails' do
before do
allow(described_class).to receive(:storage_read_test).with(any_args).and_return(false)
end
it { is_expected.to include(result_class.new(false, 'cannot read from storage', shard: :default)) }
end
context 'write test fails' do
before do
allow(described_class).to receive(:storage_write_test).with(any_args).and_return(false)
end
it { is_expected.to include(result_class.new(false, 'cannot write to storage', shard: :default)) }
end
end
end
describe '#metrics' do
subject { described_class.metrics }
context 'storage points to not existing folder' do
let(:storages_paths) do
{
default: { path: 'tmp/this/path/doesnt/exist' }
}.with_indifferent_access
end
# Ok flags are 0, while latencies are still reported.
it { is_expected.to include(metric_class.new(:filesystem_accessible, 0, shard: :default)) }
it { is_expected.to include(metric_class.new(:filesystem_readable, 0, shard: :default)) }
it { is_expected.to include(metric_class.new(:filesystem_writable, 0, shard: :default)) }
it { is_expected.to include(have_attributes(name: :filesystem_access_latency, value: be > 0, labels: { shard: :default })) }
it { is_expected.to include(have_attributes(name: :filesystem_read_latency, value: be > 0, labels: { shard: :default })) }
it { is_expected.to include(have_attributes(name: :filesystem_write_latency, value: be > 0, labels: { shard: :default })) }
end
context 'storage points to directory that has both read and write rights' do
before do
FileUtils.chmod_R(0755, tmp_dir)
end
it { is_expected.to include(metric_class.new(:filesystem_accessible, 1, shard: :default)) }
it { is_expected.to include(metric_class.new(:filesystem_readable, 1, shard: :default)) }
it { is_expected.to include(metric_class.new(:filesystem_writable, 1, shard: :default)) }
it { is_expected.to include(have_attributes(name: :filesystem_access_latency, value: be > 0, labels: { shard: :default })) }
it { is_expected.to include(have_attributes(name: :filesystem_read_latency, value: be > 0, labels: { shard: :default })) }
it { is_expected.to include(have_attributes(name: :filesystem_write_latency, value: be > 0, labels: { shard: :default })) }
end
end
end
context 'when popen always finds required binaries' do
before do
# Call through to the real popen; a RuntimeError from it is re-raised
# with an explicit message.
allow(Gitlab::Popen).to receive(:popen).and_wrap_original do |method, *args, &block|
begin
method.call(*args, &block)
rescue RuntimeError
raise 'expected not to happen'
end
end
end
it_behaves_like 'filesystem checks'
end
context 'when popen never finds required binaries' do
before do
# Every popen call raises Errno::ENOENT.
allow(Gitlab::Popen).to receive(:popen).and_raise(Errno::ENOENT)
end
it_behaves_like 'filesystem checks'
end
end
require 'spec_helper'
require_relative './simple_check_shared'
# Runs the shared 'simple_check' examples with the metric prefix
# 'redis_ping', the human-readable check name 'Redis' and the success value
# 'PONG'.
describe Gitlab::HealthChecks::RedisCheck do
include_examples 'simple_check', 'redis_ping', 'Redis', 'PONG'
end
# Shared examples for checks built on SimpleAbstractCheck.
#
# metrics_prefix - prefix of the metric names the check emits
# check_name     - human-readable check name used in failure messages
# success_result - value `check` returns when the check passes
#
# `check` is stubbed throughout, so no real service is contacted by the
# metrics and readiness examples.
shared_context 'simple_check' do |metrics_prefix, check_name, success_result|
  describe '#metrics' do
    subject { described_class.metrics }

    context 'Check is passing' do
      before do
        allow(described_class).to receive(:check).and_return success_result
      end

      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_success", value: 1)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_timeout", value: 0)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_latency", value: be > 0)) }
    end

    context 'Check is misbehaving' do
      before do
        allow(described_class).to receive(:check).and_return 'error!'
      end

      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_success", value: 0)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_timeout", value: 0)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_latency", value: be > 0)) }
    end

    context 'Check is timeouting' do
      before do
        allow(described_class).to receive(:check).and_return Timeout::Error.new
      end

      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_success", value: 0)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_timeout", value: 1)) }
      it { is_expected.to include(have_attributes(name: "#{metrics_prefix}_latency", value: be > 0)) }
    end
  end

  describe '#readiness' do
    subject { described_class.readiness }

    context 'Check returns ok' do
      before do
        allow(described_class).to receive(:check).and_return success_result
      end

      it { is_expected.to have_attributes(success: true) }
    end

    context 'Check is misbehaving' do
      before do
        allow(described_class).to receive(:check).and_return 'error!'
      end

      it { is_expected.to have_attributes(success: false, message: "unexpected #{check_name} check result: error!") }
    end

    context 'Check is timeouting' do
      before do
        allow(described_class).to receive(:check).and_return Timeout::Error.new
      end

      it { is_expected.to have_attributes(success: false, message: "#{check_name} check timed out") }
    end
  end

  describe '#liveness' do
    # Exercise liveness itself; the subject previously called readiness, so
    # this group never covered the method it describes.
    subject { described_class.liveness }

    it { is_expected.to eq(Gitlab::HealthChecks::Result.new(true)) }
  end
end
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment