Preload pipeline data for project pipelines

When displaying the pipelines of a project we now preload the following data: 1. Authors of the commits that belong to these pipelines 2. The number of warnings per pipeline, which is used by Ci::Pipeline#has_warnings? == Commit Authors Previously this data was queried for every Commit separately, leading to 20 SQL queries being executed in the worst case. With an average of 3 to 5 milliseconds per SQL query this could result in 100 milliseconds being spent in _just_ getting Commit authors. To preload this data Commit#author now uses BatchLoader (through Commit#lazy_author), and a separate module Gitlab::Ci::Pipeline::Preloader is used to ensure all authors are loaded before they are used. == Number of warnings This changes Ci::Pipeline#has_warnings? so it supports preloading of the number of warnings per pipeline. This removes the need for executing a COUNT(*) query for every pipeline just to see if it has any warnings or not.

Preload pipeline data for project pipelines
19428e80 · Yorick Peterse · 70985aa1 · 19428e80 · 19428e80 · 19428e80
Unverified Commit 19428e80 authored May 07, 2018 by Yorick Peterse
7 changed files
--- a/app/controllers/projects/pipelines_controller.rb
+++ b/app/controllers/projects/pipelines_controller.rb
@@ -23,7 +23,7 @@ class Projects::PipelinesController < Projects::ApplicationController
    @finished_count = limited_pipelines_count(project, 'finished')
    @pipelines_count = limited_pipelines_count(project)

-    @pipelines.map(&:commit) # List commits for batch loading
+    Gitlab::Ci::Pipeline::Preloader.preload(@pipelines)

    respond_to do |format|
      format.html

--- a/app/models/ci/pipeline.rb
+++ b/app/models/ci/pipeline.rb
@@ -406,7 +406,18 @@ module Ci
    end

    def has_warnings?
-      builds.latest.failed_but_allowed.any?
+      number_of_warnings.positive?
+    end
+
+    def number_of_warnings
+      BatchLoader.for(id).batch(default_value: 0) do |pipeline_ids, loader|
+        Build.where(commit_id: pipeline_ids)
+          .latest
+          .failed_but_allowed
+          .group(:commit_id)
+          .count
+          .each { |id, amount| loader.call(id, amount) }
+      end
    end

    def set_config_source

--- a/app/models/commit.rb
+++ b/app/models/commit.rb
@@ -224,8 +224,34 @@ class Commit
    Gitlab::ClosingIssueExtractor.new(project, current_user).closed_by_message(safe_message)
  end

+  def lazy_author
+    BatchLoader.for(author_email.downcase).batch do |emails, loader|
+      # A Hash that maps user Emails to the corresponding User objects. The
+      # Emails at this point are the _primary_ Emails of the Users.
+      users_for_emails = User
+        .by_any_email(emails)
+        .each_with_object({}) { |user, hash| hash[user.email] = user }
+
+      users_for_ids = users_for_emails
+        .values
+        .each_with_object({}) { |user, hash| hash[user.id] = user }
+
+      # Some commits may have used an alternative Email address. In this case we
+      # need to query the "emails" table to map those addresses to User objects.
+      Email
+        .where(email: emails - users_for_emails.keys)
+        .pluck(:email, :user_id)
+        .each { |(email, id)| users_for_emails[email] = users_for_ids[id] }
+
+      users_for_emails.each { |email, user| loader.call(email, user) }
+    end
+  end
+
  def author
-    User.find_by_any_email(author_email.downcase)
+    # We use __sync so that we get the actual objects back (including an actual
+    # nil), instead of a wrapper, as returning a wrapped nil breaks a lot of
+    # code.
+    lazy_author.__sync
  end
  request_cache(:author) { author_email.downcase }


--- a/lib/gitlab/ci/pipeline/preloader.rb
+++ b/lib/gitlab/ci/pipeline/preloader.rb
+# frozen_string_literal: true
+
+module Gitlab
+  module Ci
+    module Pipeline
+      # Class for preloading data associated with pipelines such as commit
+      # authors.
+      module Preloader
+        def self.preload(pipelines)
+          # This ensures that all the pipeline commits are eager loaded before we
+          # start using them.
+          pipelines.each(&:commit)
+
+          pipelines.each do |pipeline|
+            # This preloads the author of every commit. We're using "lazy_author"
+            # here since "author" immediately loads the data on the first call.
+            pipeline.commit.try(:lazy_author)
+
+            # This preloads the number of warnings for every pipeline, ensuring
+            # that Ci::Pipeline#has_warnings? doesn't execute any additional
+            # queries.
+            pipeline.number_of_warnings
+          end
+        end
+      end
+    end
+  end
+end
--- a/spec/lib/gitlab/ci/pipeline/preloader_spec.rb
+++ b/spec/lib/gitlab/ci/pipeline/preloader_spec.rb
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+describe Gitlab::Ci::Pipeline::Preloader do
+  describe '.preload' do
+    it 'preloads the author of every pipeline commit' do
+      commit = double(:commit)
+      pipeline = double(:pipeline, commit: commit)
+
+      expect(commit)
+        .to receive(:lazy_author)
+
+      expect(pipeline)
+        .to receive(:number_of_warnings)
+
+      described_class.preload([pipeline])
+    end
+  end
+end
--- a/spec/models/ci/pipeline_spec.rb
+++ b/spec/models/ci/pipeline_spec.rb
@@ -774,6 +774,33 @@ describe Ci::Pipeline, :mailer do
    end
  end

+  describe '#number_of_warnings' do
+    it 'returns the number of warnings' do
+      create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline, name: 'rubocop')
+
+      expect(pipeline.number_of_warnings).to eq(1)
+    end
+
+    it 'supports eager loading of the number of warnings' do
+      pipeline2 = create(:ci_empty_pipeline, status: :created, project: project)
+
+      create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline, name: 'rubocop')
+      create(:ci_build, :allowed_to_fail, :failed, pipeline: pipeline2, name: 'rubocop')
+
+      pipelines = project.pipelines.to_a
+
+      pipelines.each(&:number_of_warnings)
+
+      # To run the queries we need to actually use the lazy objects, which we do
+      # by just sending "to_i" to them.
+      amount = ActiveRecord::QueryRecorder
+        .new { pipelines.each { |p| p.number_of_warnings.to_i } }
+        .count
+
+      expect(amount).to eq(1)
+    end
+  end
+
  shared_context 'with some outdated pipelines' do
    before do
      create_pipeline(:canceled, 'ref', 'A', project)

--- a/spec/models/commit_spec.rb
+++ b/spec/models/commit_spec.rb
@@ -52,22 +52,98 @@ describe Commit do
    end
  end

-  describe '#author' do
+  describe '#author', :request_store do
    it 'looks up the author in a case-insensitive way' do
      user = create(:user, email: commit.author_email.upcase)
      expect(commit.author).to eq(user)
    end

-    it 'caches the author', :request_store do
+    it 'caches the author' do
      user = create(:user, email: commit.author_email)
-      expect(User).to receive(:find_by_any_email).and_call_original

      expect(commit.author).to eq(user)
+
      key = "Commit:author:#{commit.author_email.downcase}"
-      expect(RequestStore.store[key]).to eq(user)

+      expect(RequestStore.store[key]).to eq(user)
      expect(commit.author).to eq(user)
    end
+
+    context 'using eager loading' do
+      let!(:alice) { create(:user, email: 'alice@example.com') }
+      let!(:bob) { create(:user, email: 'hunter2@example.com') }
+
+      let(:alice_commit) do
+        described_class.new(RepoHelpers.sample_commit, project).tap do |c|
+          c.author_email = 'alice@example.com'
+        end
+      end
+
+      let(:bob_commit) do
+        # The commit for Bob uses one of his alternative Emails, instead of the
+        # primary one.
+        described_class.new(RepoHelpers.sample_commit, project).tap do |c|
+          c.author_email = 'bob@example.com'
+        end
+      end
+
+      let(:eve_commit) do
+        described_class.new(RepoHelpers.sample_commit, project).tap do |c|
+          c.author_email = 'eve@example.com'
+        end
+      end
+
+      let!(:commits) { [alice_commit, bob_commit, eve_commit] }
+
+      before do
+        create(:email, user: bob, email: 'bob@example.com')
+      end
+
+      it 'executes only two SQL queries' do
+        recorder = ActiveRecord::QueryRecorder.new do
+          # Running this first ensures we don't run one query for every
+          # commit.
+          commits.each(&:lazy_author)
+
+          # This forces the execution of the SQL queries necessary to load the
+          # data.
+          commits.each { |c| c.author.try(:id) }
+        end
+
+        expect(recorder.count).to eq(2)
+      end
+
+      it "preloads the authors for Commits matching a user's primary Email" do
+        commits.each(&:lazy_author)
+
+        expect(alice_commit.author).to eq(alice)
+      end
+
+      it "preloads the authors for Commits using a User's alternative Email" do
+        commits.each(&:lazy_author)
+
+        expect(bob_commit.author).to eq(bob)
+      end
+
+      it 'sets the author to Nil if an author could not be found for a Commit' do
+        commits.each(&:lazy_author)
+
+        expect(eve_commit.author).to be_nil
+      end
+
+      it 'does not execute SQL queries once the authors are preloaded' do
+        commits.each(&:lazy_author)
+        commits.each { |c| c.author.try(:id) }
+
+        recorder = ActiveRecord::QueryRecorder.new do
+          alice_commit.author
+          bob_commit.author
+          eve_commit.author
+        end
+
+        expect(recorder.count).to be_zero
+      end
+    end
  end

  describe '#to_reference' do