BigW Consortium Gitlab

importer.rb 11.1 KB
Newer Older
Valery Sizov committed
1
module Gitlab
2
  module GithubImport
Valery Sizov committed
3
    class Importer
4 5
      include Gitlab::ShellAdapter

6
      attr_reader :errors, :project, :repo, :repo_url
Valery Sizov committed
7 8

      # Builds an importer for +project+.
      #
      # Captures the project's import source and import URL, and starts with
      # an empty error list and an empty label cache (filled by cache_labels!).
      def initialize(project)
        @project  = project
        @repo     = project.import_source
        @repo_url = project.import_url
        @errors   = []
        @labels   = {}
      end

      # Returns the memoized API client for this import.
      #
      # For Gitea imports the API host is derived from the project's import URL
      # and API version 'v1' is requested; otherwise no extra options are passed.
      #
      # Raises Projects::ImportService::Error when the project has no import
      # credentials.
      def client
        return @client if defined?(@client)

        unless credentials
          raise Projects::ImportService::Error,
                "Unable to find project import data credentials for project ID: #{@project.id}"
        end

        opts = {}
        # Gitea plan to be GitHub compliant
        if project.gitea_import?
          uri = URI.parse(project.import_url)
          # Strip the trailing "<namespace>/<repo>.git" so only the instance root is left
          host = "#{uri.scheme}://#{uri.host}:#{uri.port}#{uri.path}".sub(%r{/?[\w-]+/[\w-]+\.git\z}, '')
          opts = {
            host: host,
            api_version: 'v1'
          }
        end

        @client = Client.new(credentials[:user], opts)
      end

      # Runs every import step in order, records accumulated errors on the
      # project via handle_errors, and returns true.
      def execute
        # The ordering of importing is important here due to the way GitHub structures their data
        # 1. Labels are required by other items while not having a dependency on anything else
        # so need to be first
        # 2. Pull requests must come before issues. Every pull request is also an issue but not
        # all issues are pull requests. Only the issue entity has labels defined in GitHub. GitLab
        # doesn't structure data like this so we need to make sure that we've created the MRs
        # before we attempt to add the labels defined in the GitHub issue for the related, already
        # imported, pull request
        import_labels
        import_milestones
        import_pull_requests
        import_issues
        import_comments(:issues)
        import_comments(:pull_requests)
        import_wiki

        # Gitea doesn't have a Release API yet
        # See https://github.com/go-gitea/gitea/issues/330
        unless project.gitea_import?
          import_releases
        end

        handle_errors

        true
      end

      private

67
      # Returns the credentials stored in the project's import data, or nil
      # when the project has no import data. The result (including nil) is
      # memoized.
      def credentials
        return @credentials if defined?(@credentials)

        import_data  = project.import_data
        @credentials = import_data ? import_data.credentials : nil
      end

73
      # Persists the collected errors, serialized as JSON, in the project's
      # import_error column. Does nothing when no error was recorded.
      def handle_errors
        return unless errors.any?

        project.update_column(:import_error, {
          message: 'The remote data could not be fully imported.',
          errors: errors
        }.to_json)
      end

82
      # Imports the repository's labels page by page, then rebuilds the
      # title => id label cache via cache_labels!.
      #
      # A failure on a single label is appended to +errors+ and does not stop
      # the remaining labels from being processed.
      def import_labels
        fetch_resources(:labels, repo, per_page: 100) do |labels|
          labels.each do |raw|
            begin
              gh_label = LabelFormatter.new(project, raw)
              gh_label.create!
            rescue => e
              errors << { type: :label, url: Gitlab::UrlSanitizer.sanitize(gh_label.url), errors: e.message }
            end
          end
        end

        cache_labels!
      end

97
      # Imports the repository's milestones (all states) page by page.
      #
      # A failure on a single milestone is appended to +errors+ and does not
      # stop the remaining milestones from being processed.
      def import_milestones
        fetch_resources(:milestones, repo, state: :all, per_page: 100) do |milestones|
          milestones.each do |raw|
            begin
              gh_milestone = MilestoneFormatter.new(project, raw)
              gh_milestone.create!
            rescue => e
              errors << { type: :milestone, url: Gitlab::UrlSanitizer.sanitize(gh_milestone.url), errors: e.message }
            end
          end
        end
      end

110
      # Imports the repository's issues (all states, oldest first) page by page
      # and applies their labels.
      #
      # An issue that is really a pull request is not created again: the merge
      # request with the same iid is looked up so the labels can be applied to it.
      # A failure on a single issue is appended to +errors+.
      def import_issues
        fetch_resources(:issues, repo, state: :all, sort: :created, direction: :asc, per_page: 100) do |issues|
          issues.each do |raw|
            gh_issue = IssueFormatter.new(project, raw, client)

            begin
              issuable =
                if gh_issue.pull_request?
                  MergeRequest.find_by(target_project_id: project.id, iid: gh_issue.number)
                else
                  gh_issue.create!
                end

              apply_labels(issuable, raw)
            rescue => e
              errors << { type: :issue, url: Gitlab::UrlSanitizer.sanitize(gh_issue.url), errors: e.message }
            end
          end
        end
      end

131
      # Imports the repository's pull requests (all states, oldest first) page
      # by page as merge requests.
      #
      # Pull requests the formatter reports as invalid are skipped. Missing
      # source/target branches are recreated before the merge request is
      # created and cleaned up afterwards, even when creation fails. A failure
      # on a single pull request is appended to +errors+.
      def import_pull_requests
        fetch_resources(:pull_requests, repo, state: :all, sort: :created, direction: :asc, per_page: 100) do |pull_requests|
          pull_requests.each do |raw|
            gh_pull_request = PullRequestFormatter.new(project, raw, client)

            next unless gh_pull_request.valid?

            begin
              restore_source_branch(gh_pull_request) unless gh_pull_request.source_branch_exists?
              restore_target_branch(gh_pull_request) unless gh_pull_request.target_branch_exists?

              merge_request = gh_pull_request.create!

              # Gitea doesn't return PR in the Issue API endpoint, so labels must be assigned at this stage
              if project.gitea_import?
                apply_labels(merge_request, raw)
              end
            rescue => e
              errors << { type: :pull_request, url: Gitlab::UrlSanitizer.sanitize(gh_pull_request.url), errors: e.message }
            ensure
              clean_up_restored_branches(gh_pull_request)
            end
          end
        end

        # NOTE(review): presumably refreshes repository state once the temporary
        # branches have been removed -- confirm against Repository#after_remove_branch
        project.repository.after_remove_branch
      end

159
      # Creates the pull request's source branch in the project repository,
      # pointing at the source SHA reported by the pull request.
      def restore_source_branch(pull_request)
        project.repository.create_branch(pull_request.source_branch_name, pull_request.source_branch_sha)
      end
162

163 164
      # Creates the pull request's target branch in the project repository,
      # pointing at the target SHA reported by the pull request.
      def restore_target_branch(pull_request)
        project.repository.create_branch(pull_request.target_branch_name, pull_request.target_branch_sha)
      end

167 168 169
      # Deletes branch +name+ from the project repository. A
      # Rugged::ReferenceError raised by the deletion is recorded in +errors+
      # instead of being propagated.
      def remove_branch(name)
        project.repository.delete_branch(name)
      rescue Rugged::ReferenceError
        errors << { type: :remove_branch, name: name }
      end

      # Removes the source/target branches of a pull request that is not open,
      # for each branch the pull request reports as not existing. Open pull
      # requests are left untouched.
      def clean_up_restored_branches(pull_request)
        return if pull_request.opened?

        remove_branch(pull_request.source_branch_name) unless pull_request.source_branch_exists?
        remove_branch(pull_request.target_branch_name) unless pull_request.target_branch_exists?
      end

180 181
      # Assigns to +issuable+ the cached label ids matching the label names on
      # the raw remote resource. Names missing from the label cache are ignored.
      #
      # Does nothing when the resource has no labels, or when +issuable+ is nil
      # (a caller's lookup, e.g. MergeRequest.find_by, may have found nothing).
      def apply_labels(issuable, raw)
        return if issuable.nil?
        return unless raw.labels.count > 0

        label_ids = raw.labels
          .map { |attrs| @labels[attrs.name] }
          .compact

        issuable.update_attribute(:label_ids, label_ids)
      end

190 191
      # Imports the comments for +issuable_type+ (:issues or :pull_requests)
      # page by page, skipping on the first fetched page the comments that were
      # already inserted.
      def import_comments(issuable_type)
        resource_type = "#{issuable_type}_comments".to_sym

        # Two notes here:
        # 1. We don't have a distinctive attribute for comments (unlike issues' iid), so we fetch the last inserted
        # note and compare it against every comment in the current imported page until we find a match; that is
        # where we start importing
        # 2. GH returns comments for _both_ issues and PRs through issues_comments API, while pull_requests_comments
        # returns only comments on diffs, so select the last note not based on noteable_type but on line_code
        line_code_is = issuable_type == :pull_requests ? 'NOT NULL' : 'NULL'
        last_note    = project.notes.where("line_code IS #{line_code_is}").last

        fetch_resources(resource_type, repo, per_page: 100) do |comments|
          # Only the first page handed to us can overlap with already inserted notes
          if last_note
            discard_inserted_comments(comments, last_note)
            last_note = nil
          end

          create_comments(comments)
        end
      end
Valery Sizov committed
210

211
      # Creates a note for every raw comment, attached to the issue or merge
      # request the comment belongs to. Comments whose parent cannot be found
      # are skipped; a failure on a single comment is appended to +errors+.
      # Runs inside ActiveRecord::Base.no_touching.
      def create_comments(comments)
        ActiveRecord::Base.no_touching do
          comments.each do |raw|
            begin
              comment = CommentFormatter.new(project, raw, client)

              # GH does not return info about comment's parent, so we guess it by checking its URL!
              *_, parent, iid = URI(raw.html_url).path.split('/')

              issuable = if parent == 'issues'
                           Issue.find_by(project_id: project.id, iid: iid)
                         else
                           MergeRequest.find_by(target_project_id: project.id, iid: iid)
                         end

              next unless issuable

              issuable.notes.create!(comment.attributes)
            rescue => e
              errors << { type: :comment, url: Gitlab::UrlSanitizer.sanitize(raw.url), errors: e.message }
            end
          end
        end
      end
235

236 237 238 239
      # Removes from +comments+ (in place) every comment up to and including
      # the one whose formatted attributes equal those of +last_note+.
      # Leaves +comments+ untouched and returns nil when no comment matches.
      def discard_inserted_comments(comments, last_note)
        last_note_attrs = nil

        cut_off_index = comments.find_index do |raw|
          comment           = CommentFormatter.new(project, raw)
          comment_attrs     = comment.attributes
          # Computed once, using the attribute keys of the first formatted comment
          last_note_attrs ||= last_note.slice(*comment_attrs.keys)

          comment_attrs.with_indifferent_access == last_note_attrs
        end

        # No matching resource in the collection, which means we got halted right on the end of the last page, so all good
        return unless cut_off_index

        # Otherwise, remove the resources we've already inserted
        comments.shift(cut_off_index + 1)
      end

254
      # Imports the wiki repository through gitlab_shell unless the project's
      # wiki repository already exists.
      #
      # A Gitlab::Shell::Error is recorded in +errors+, except for the
      # "repository not exported" message, which is ignored.
      def import_wiki
        unless project.wiki.repository_exists?
          wiki = WikiFormatter.new(project)
          gitlab_shell.import_repository(project.repository_storage_path, wiki.path_with_namespace, wiki.import_url)
        end
      rescue Gitlab::Shell::Error => e
        # GitHub error message when the wiki repo has not been created,
        # this means that repo has wiki enabled, but have no pages. So,
        # we can skip the import.
        if e.message !~ /repository not exported/
          errors << { type: :wiki, errors: e.message }
        end
      end
267 268

      # Imports the repository's releases page by page, creating only those the
      # formatter reports as valid. A failure on a single release is appended
      # to +errors+.
      def import_releases
        fetch_resources(:releases, repo, per_page: 100) do |releases|
          releases.each do |raw|
            begin
              gh_release = ReleaseFormatter.new(project, raw)
              gh_release.create! if gh_release.valid?
            rescue => e
              errors << { type: :release, url: Gitlab::UrlSanitizer.sanitize(gh_release.url), errors: e.message }
            end
          end
        end
      end

281
      # Fills the in-memory label cache with a title => id entry for every
      # label of the project.
      def cache_labels!
        project.labels.select(:id, :title).find_each do |label|
          @labels[label.title] = label.id
        end
      end

287 288 289
      # Fetches +resource_type+ from the client page by page, yielding each
      # page of resources to the given block.
      #
      # Resumes from the cached current page, bumps the cached page after every
      # yielded page and flags the resource type as imported once the client
      # call returns. Returns immediately when the type is already flagged as
      # imported.
      #
      # The last element of +opts+ must be a Hash: the page number is written
      # into it before it is forwarded to the client.
      def fetch_resources(resource_type, *opts)
        return if imported?(resource_type)

        opts.last[:page] = current_page(resource_type)

        client.public_send(resource_type, *opts) do |resources|
          yield resources
          increment_page(resource_type)
        end

        imported!(resource_type)
      end

      # Reads the cached "imported" flag for +resource_type+; returns whatever
      # the cache holds under that key (nil when nothing was written).
      def imported?(resource_type)
        flag_key = "#{cache_key_prefix}:#{resource_type}:imported"
        Rails.cache.read(flag_key)
      end

      # Writes the cached "imported" flag for +resource_type+, passing a
      # one-day `ex` option to the cache.
      def imported!(resource_type)
        flag_key = "#{cache_key_prefix}:#{resource_type}:imported"
        Rails.cache.write(flag_key, true, ex: 1.day)
      end

      # Bumps the cached current page for +resource_type+ by one and returns
      # the new page number.
      def increment_page(resource_type)
        page_key = "#{cache_key_prefix}:#{resource_type}:current-page"

        # Rails.cache.increment issues INCRBY against the stored value, which is a serialized
        # ActiveSupport::Cache::Entry, and Redis rejects that; so read, add and write back by hand
        next_page = Rails.cache.read(page_key) + 1
        Rails.cache.write(page_key, next_page)

        next_page
      end

      # Returns the cached current page for +resource_type+, using 1 as the
      # value when the cache has no entry for it.
      def current_page(resource_type)
        page_key = "#{cache_key_prefix}:#{resource_type}:current-page"
        Rails.cache.fetch(page_key, ex: 1.day) { 1 }
      end

      # Memoized prefix shared by every cache key of this import, built from
      # the project id.
      def cache_key_prefix
        @cache_key_prefix ||= "github-import:#{project.id}"
      end
Valery Sizov committed
327 328 329
    end
  end
end