# fs_shards_check.rb (from the BigW Consortium GitLab repository)
module Gitlab
  module HealthChecks
    # Health check for the configured repository storages ("shards").
    #
    # For every storage listed in `Gitlab.config.repositories.storages` it
    # verifies that the circuit breaker lets calls through, that the storage
    # directory can be stat'ed, and that a probe file can be written and read
    # back. The same probes are also exposed as ok/latency metrics.
    #
    # `metric` and `with_timing` are not defined in this class; they are
    # presumably provided by BaseAbstractCheck (confirm there).
    class FsShardsCheck
      extend BaseAbstractCheck

      # Payload written to / compared against the probe file.
      RANDOM_STRING = SecureRandom.hex(1000).freeze
      # Duration argument handed to the `timeout` executable
      # (seconds when no unit suffix is given, per coreutils `timeout`).
      COMMAND_TIMEOUT = '1'.freeze
      TIMEOUT_EXECUTABLE = 'timeout'.freeze

      class << self
        # Runs the readiness probes for every configured storage.
        #
        # @return [Array<HealthChecks::Result>] one result per storage name
        def readiness
          repository_storages.map { |storage_name| check_storage_readiness(storage_name) }
        end

        # Collects the stat/write/read/circuit-breaker metrics of every storage.
        #
        # @return [Array] flat list of whatever `metric` returns
        def metrics
          repository_storages.flat_map do |storage_name|
            [
              storage_stat_metrics(storage_name),
              storage_write_metrics(storage_name),
              storage_read_metrics(storage_name),
              storage_circuitbreaker_metrics(storage_name)
            ].flatten
          end
        end

        private

        # Probes a single storage and reports the first failing step.
        # A RuntimeError raised by any probe is logged and turned into a
        # failed result instead of propagating.
        def check_storage_readiness(storage_name)
          if !storage_circuitbreaker_test(storage_name)
            HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name)
          elsif !storage_stat_test(storage_name)
            HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
          else
            # with_temp_file returns the block's value, i.e. the Result below.
            with_temp_file(storage_name) do |tmp_file_path|
              if !storage_write_test(tmp_file_path)
                HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
              elsif !storage_read_test(tmp_file_path)
                HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
              else
                HealthChecks::Result.new(true, nil, shard: storage_name)
              end
            end
          end
        rescue RuntimeError => ex
          message = "unexpected error #{ex} when checking storage #{storage_name}"
          Rails.logger.error(message)
          HealthChecks::Result.new(false, message, shard: storage_name)
        end

        # Builds the latency and ok metrics for one probe.
        #
        # The block must return a `[result, elapsed]` pair. If it raises a
        # RuntimeError, the error is logged and only the ok metric (value 0)
        # is returned.
        def operation_metrics(ok_metric, latency_metric, **labels)
          result, elapsed = yield
          [
            metric(latency_metric, elapsed, **labels),
            metric(ok_metric, result ? 1 : 0, **labels)
          ]
        rescue RuntimeError => ex
          Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
          [metric(ok_metric, 0, **labels)]
        end

        # Names of all configured repository storages.
        def repository_storages
          storages_paths.keys
        end

        # Storage configuration keyed by storage name.
        def storages_paths
          Gitlab.config.repositories.storages
        end

        # Runs `cmd_args` through Gitlab::Popen, prefixed with
        # `timeout <COMMAND_TIMEOUT>`. Extra arguments and the block are
        # forwarded unchanged.
        def exec_with_timeout(cmd_args, *args, &block)
          Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT, *cmd_args], *args, &block)
        end

        # Yields a fresh, not yet created, file path inside the storage and
        # removes that file afterwards. Returns the value of the block.
        def with_temp_file(storage_name)
          temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path }
          yield temp_file_path
        ensure
          # Nothing to clean up when generating the name itself failed.
          delete_test_file(temp_file_path) if temp_file_path
        end

        # Filesystem path configured for the given storage, or nil.
        def storage_path(storage_name)
          storages_paths&.dig(storage_name, 'path')
        end

        # All test methods below use shell commands to perform actions on storage volumes.
        # If a storage volume has connectivity problems that would make a pure Ruby IO
        # operation wait indefinitely, we can rely on the shell command being terminated
        # once `timeout` kills it.
        #
        # However, we also fall back to pure Ruby file operations when a specific shell
        # command is missing (Errno::ENOENT), so we are still able to perform health
        # checks and gather metrics on such a system.

        # Removes the probe file.
        #
        # @return [Boolean] true when the file is gone afterwards
        def delete_test_file(tmp_path)
          _, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
          status.zero?
        rescue Errno::ENOENT
          begin
            File.delete(tmp_path)
            true
          rescue Errno::ENOENT
            # Already absent: `rm -f` treats this as success, so do we.
            true
          rescue StandardError
            # Best-effort cleanup: report the failure instead of raising.
            false
          end
        end

        # Checks that the storage directory can be stat'ed.
        def storage_stat_test(storage_name)
          stat_path = File.join(storage_path(storage_name), '.')
          _, status = exec_with_timeout(%W{ stat #{stat_path} })
          status.zero?
        rescue Errno::ENOENT
          File.exist?(stat_path) && File::Stat.new(stat_path).readable?
        end

        # Writes RANDOM_STRING to `tmp_path`.
        #
        # @return [Boolean] true when the whole payload was written
        def storage_write_test(tmp_path)
          _, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status.zero?
        rescue Errno::ENOENT
          begin
            # File.write returns the number of bytes written.
            File.write(tmp_path, RANDOM_STRING) == RANDOM_STRING.bytesize
          rescue StandardError
            # Any failure of the fallback write means the storage is not writable.
            false
          end
        end

        # Compares the content of `tmp_path` with RANDOM_STRING.
        #
        # @return [Boolean] true when the file holds exactly the payload
        def storage_read_test(tmp_path)
          _, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
            stdin.write(RANDOM_STRING)
          end
          status.zero?
        rescue Errno::ENOENT
          begin
            File.read(tmp_path) == RANDOM_STRING
          rescue StandardError
            # Any failure of the fallback read means the storage is not readable.
            false
          end
        end

        # Runs a no-op through the storage's circuit breaker.
        #
        # @return [String, nil] "OK" when the call went through, nil when the
        #   breaker reported the storage as inaccessible
        def storage_circuitbreaker_test(storage_name)
          Gitlab::Git::Storage::CircuitBreaker.build(storage_name).perform { "OK" }
        rescue Gitlab::Git::Storage::Inaccessible
          nil
        end

        def storage_stat_metrics(storage_name)
          operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
            with_timing { storage_stat_test(storage_name) }
          end
        end

        def storage_write_metrics(storage_name)
          operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
            with_temp_file(storage_name) do |tmp_file_path|
              with_timing { storage_write_test(tmp_file_path) }
            end
          end
        end

        def storage_read_metrics(storage_name)
          operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
            with_temp_file(storage_name) do |tmp_file_path|
              storage_write_test(tmp_file_path) # writes data used by read test
              with_timing { storage_read_test(tmp_file_path) }
            end
          end
        end

        def storage_circuitbreaker_metrics(storage_name)
          operation_metrics(:filesystem_circuitbreaker,
                            :filesystem_circuitbreaker_latency_seconds,
                            shard: storage_name) do
            with_timing { storage_circuitbreaker_test(storage_name) }
          end
        end
      end
    end
  end
end