BigW Consortium Gitlab

fs_shards_check.rb 4.26 KB
Newer Older
1 2 3 4
module Gitlab
  module HealthChecks
    class FsShardsCheck
      extend BaseAbstractCheck
5 6 7
      # Payload used by the write and read probes: it is written to the probe
      # file and later compared against the file's contents. Generated once,
      # when this class body is evaluated.
      RANDOM_STRING = SecureRandom.hex(1000).freeze
      # Duration argument handed to TIMEOUT_EXECUTABLE ahead of the probed command.
      # NOTE(review): presumably seconds, as with coreutils `timeout` - confirm.
      COMMAND_TIMEOUT = '1'.freeze
      # Name of the executable used to wrap every probe command.
      TIMEOUT_EXECUTABLE = 'timeout'.freeze
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66

      class << self
        # Runs the stat, write and read probes against every repository storage.
        #
        # Probes run in that order and stop at the first one that fails. A
        # RuntimeError raised while probing is logged and reported as a failed
        # result. The probe file is always handed to delete_test_file afterwards.
        #
        # Returns an Array with one HealthChecks::Result per storage name.
        def readiness
          repository_storages.map do |storage_name|
            begin
              probe_path = tmp_file_path(storage_name)

              # nil means every probe passed.
              failure_reason =
                if !storage_stat_test(storage_name)
                  'cannot stat storage'
                elsif !storage_write_test(probe_path)
                  'cannot write to storage'
                elsif !storage_read_test(probe_path)
                  'cannot read from storage'
                end

              HealthChecks::Result.new(failure_reason.nil?, failure_reason, shard: storage_name)
            rescue RuntimeError => ex
              message = "unexpected error #{ex} when checking storage #{storage_name}"
              Rails.logger.error(message)
              HealthChecks::Result.new(false, message, shard: storage_name)
            ensure
              # probe_path is nil here if building the path itself raised.
              delete_test_file(probe_path)
            end
          end
        end

        def metrics
          repository_storages.flat_map do |storage_name|
            tmp_file_path = tmp_file_path(storage_name)
            [
              operation_metrics(:filesystem_accessible, :filesystem_access_latency, -> { storage_stat_test(storage_name) }, shard: storage_name),
              operation_metrics(:filesystem_writable, :filesystem_write_latency, -> { storage_write_test(tmp_file_path) }, shard: storage_name),
              operation_metrics(:filesystem_readable, :filesystem_read_latency, -> { storage_read_test(tmp_file_path) }, shard: storage_name)
            ].flatten
          end
        end

        private

        # Runs `operation` under with_timing and turns the outcome into metrics.
        #
        # ok_metric      - name of the success metric (1 when the operation
        #                  returned a truthy value, 0 otherwise).
        # latency_metric - name of the metric carrying the elapsed time.
        # operation      - callable handed to with_timing.
        # labels         - keyword labels forwarded to every metric.
        #
        # Returns [latency metric, ok metric]; if a RuntimeError is raised the
        # error is logged and only a zero ok metric is returned.
        def operation_metrics(ok_metric, latency_metric, operation, **labels)
          with_timing(operation) do |result, elapsed|
            [
              metric(latency_metric, elapsed, **labels),
              metric(ok_metric, result ? 1 : 0, **labels)
            ]
          end
        rescue RuntimeError => ex
          # Rails.logger is an accessor that takes no arguments; the message
          # has to go to a log-level method on the logger it returns.
          Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
          [metric(ok_metric, 0, **labels)]
        end

        # Storage names taken from the current application settings.
        #
        # The value is cached in an instance variable of this singleton after
        # the first truthy lookup, so later calls do not re-read the settings.
        def repository_storages
          return @repository_storage if @repository_storage

          @repository_storage = Gitlab::CurrentSettings.current_application_settings.repository_storages
        end

        # Storage configuration read from Gitlab.config.repositories.storages.
        #
        # Cached in an instance variable of this singleton after the first
        # truthy lookup.
        def storages_paths
          return @storage_paths if @storage_paths

          @storage_paths = Gitlab.config.repositories.storages
        end

67
        # Runs a command through Gitlab::Popen.popen, wrapped in the timeout executable.
        #
        # cmd_args - Array of command words; TIMEOUT_EXECUTABLE and
        #            COMMAND_TIMEOUT are placed in front of them.
        # args     - extra positional arguments forwarded to popen unchanged.
        # block    - forwarded to popen unchanged (callers here use it to
        #            write to the command's stdin).
        #
        # Returns whatever Gitlab::Popen.popen returns; callers in this class
        # destructure it as `_, status`.
        def exec_with_timeout(cmd_args, *args, &block)
          Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT, *cmd_args], *args, &block)
        end

        def tmp_file_path(storage_name)
          Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path }
        end

        # Looks up the 'path' entry configured for a storage.
        #
        # Returns nil when no storage configuration is available or the
        # lookup finds nothing.
        def path(storage_name)
          configured = storages_paths
          return nil if configured.nil?

          configured.dig(storage_name, 'path')
        end

        # Checks that the storage directory can be stat'ed.
        #
        # Runs `stat <storage path>/.` through exec_with_timeout and reports
        # whether it exited with status 0. If the command cannot be spawned
        # (Errno::ENOENT) the check falls back to File.exist? plus a
        # readability check done in-process.
        #
        # Returns false when no path is configured for the storage, instead of
        # letting File.join raise on nil.
        def storage_stat_test(storage_name)
          storage_path = path(storage_name)
          return false if storage_path.nil?

          stat_path = File.join(storage_path, '.')
          _, status = exec_with_timeout(%W{ stat #{stat_path} })
          status == 0
        rescue Errno::ENOENT
          File.exist?(stat_path) && File::Stat.new(stat_path).readable?
        end

        # Checks that RANDOM_STRING can be written to tmp_path.
        #
        # Pipes the payload into `tee <tmp_path>` through exec_with_timeout and
        # reports whether it exited with status 0. If the command cannot be
        # spawned (Errno::ENOENT) the write is done in-process instead, and
        # succeeds only when the whole payload was written; any error raised by
        # that fallback write counts as a failed check.
        def storage_write_test(tmp_path)
          _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status == 0
        rescue Errno::ENOENT
          begin
            File.write(tmp_path, RANDOM_STRING) == RANDOM_STRING.bytesize
          rescue StandardError
            false
          end
        end

        # Checks that tmp_path can be read back and holds RANDOM_STRING.
        #
        # Pipes the payload into `diff <tmp_path> -` through exec_with_timeout
        # and reports whether it exited with status 0. If the command cannot be
        # spawned (Errno::ENOENT) the file is read in-process and compared
        # instead; any error raised by that fallback read counts as a failed
        # check.
        def storage_read_test(tmp_path)
          _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status == 0
        rescue Errno::ENOENT
          begin
            File.read(tmp_path) == RANDOM_STRING
          rescue StandardError
            false
          end
        end

        # Removes the probe file at tmp_path.
        #
        # Runs `rm -f <tmp_path>` through exec_with_timeout and reports whether
        # it exited with status 0. If the command cannot be spawned
        # (Errno::ENOENT) the file is deleted in-process instead: a file that
        # is already gone counts as removed (as it does for `rm -f`), and any
        # other error is reported as false rather than raised, since this is
        # best-effort cleanup.
        def delete_test_file(tmp_path)
          _, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
          status == 0
        rescue Errno::ENOENT
          begin
            File.delete(tmp_path)
            true
          rescue Errno::ENOENT
            true
          rescue StandardError
            false
          end
        end
      end
    end
  end
end