# Source: BigW Consortium Gitlab — reply_parser.rb (2.3 KB)

# Inspired in great part by Discourse's Email::Receiver
module Gitlab
  module Email
    # Extracts the reply text from an incoming mail message: picks the body
    # part to read, trims everything from the first line that looks like the
    # start of a quoted message, and returns the result encoded as UTF-8.
    class ReplyParser
      # Header labels looked for when detecting the start of a quoted message.
      # Written as an explicit array because "Reply To" contains a space:
      # %w() would split it into "Reply" and a second "To", which makes any
      # line containing "To" count twice in the same-line header check.
      REPLYING_HEADER_LABELS = ['From', 'Sent', 'To', 'Subject', 'Reply To', 'Cc', 'Bcc', 'Date'].freeze

      # Matches any of the labels above followed by a colon, e.g. "From:".
      REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" })

      attr_accessor :message

      # @param message the mail object to parse; it must respond to
      #   `multipart?`, `text_part`, `html_part`, `charset`, `body` and
      #   `content_type` (presumably a Mail::Message -- confirm with callers).
      def initialize(message)
        @message = message
      end

      # Runs the whole pipeline on `message`.
      #
      # @return [String] the trimmed reply, encoded as UTF-8
      def execute
        body = select_body(message)

        # Remember the encoding of the selected body so it can be re-applied
        # after the external parser has processed the text.
        encoding = body.encoding

        body = discourse_email_trimmer(body)

        body = EmailReplyParser.parse_reply(body)

        body.force_encoding(encoding).encode("UTF-8")
      end

      private

      # Picks the part of the message to read and returns its decoded text.
      # For multipart messages the text part is preferred, then the HTML
      # part, then the message itself.
      #
      # @return [String] the decoded body, or "" when it could not be decoded
      #   or still contains raw MIME markers
      def select_body(message)
        part =
          if message.multipart?
            message.text_part || message.html_part || message
          else
            message
          end

        decoded = fix_charset(part)

        return "" unless decoded

        # Certain trigger phrases that mean we didn't parse correctly
        if decoded =~ /(Content\-Type\:|multipart\/alternative|text\/plain)/
          return ""
        end

        if (part.content_type || '').include? 'text/html'
          HTMLParser.parse_reply(decoded)
        else
          decoded
        end
      end

      # Force encoding to UTF-8 on a Mail::Message or Mail::Part
      #
      # @return [String, nil] the body as a string, or nil when `object` is
      #   nil or decoding/transcoding raised an error
      def fix_charset(object)
        return nil if object.nil?

        if object.charset
          # Some senders declare the charset as "utf8", which Ruby does not
          # accept as an encoding name; normalise it to "UTF-8" first.
          object.body.decoded.force_encoding(object.charset.gsub(/utf8/i, "UTF-8")).encode("UTF-8").to_s
        else
          object.body.to_s
        end
      rescue
        # Best effort: an undecodable body is reported as nil so the caller
        # can fall back to an empty reply.
        nil
      end

      # Cuts the body at the first line that looks like the beginning of a
      # quoted message. The first line is always kept, because the cut index
      # starts at 0 and is only advanced past lines that do not match.
      #
      # @param body [String] the decoded message text
      # @return [String] the leading lines, joined and stripped
      def discourse_email_trimmer(body)
        lines = body.scrub.lines.to_a
        range_end = 0

        lines.each_with_index do |l, idx|
          # This one might be controversial but so many reply lines have years, times and end with a colon.
          # Let's try it and see how well it works.
          break if (l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/) ||
                   (l =~ /On \w+ \d+,? \d+,?.*wrote:/)

          # Headers on subsequent lines
          break if (0..2).all? { |off| lines[idx + off] =~ REPLYING_HEADER_REGEX }

          # Headers on the same line
          break if REPLYING_HEADER_LABELS.count { |label| l.include?(label) } >= 3

          range_end = idx
        end

        lines[0..range_end].join.strip
      end
    end
  end
end