# reply_parser.rb (2.15 KB) -- retrieved from the BigW Consortium GitLab instance
# Inspired in great part by Discourse's Email::Receiver
module Gitlab
  module Email
    # Extracts the text a user wrote in an email reply, dropping the quoted
    # original message and the header block mail clients put above it.
    #
    # Usage: ReplyParser.new(message).execute
    # `message` is expected to behave like a Mail::Message (it must respond to
    # multipart?, text_part, content_type, charset and body).
    class ReplyParser
      # Header labels that mail clients print above quoted or forwarded text.
      # "Reply To" is a single label, so this cannot be written as a %w()
      # literal without escaping the space: %w() would split it into "Reply"
      # and a second "To", which makes "To" count twice in the same-line check.
      REPLYING_HEADER_LABELS = ["From", "Sent", "To", "Subject", "Reply To", "Cc", "Bcc", "Date"].freeze

      # Matches any of the labels above followed by a colon, e.g. "From:".
      REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |label| "#{label}:" }).freeze

      attr_accessor :message

      # message - the incoming mail object to extract the reply from.
      def initialize(message)
        @message = message
      end

      # Returns the reply text as a UTF-8 String ("" when no usable text body
      # could be found).
      def execute
        body = select_body(message)

        # Remember the encoding of the decoded body so it can be re-applied to
        # whatever EmailReplyParser hands back before transcoding to UTF-8.
        encoding = body.encoding

        body = discourse_email_trimmer(body)
        body = EmailReplyParser.parse_reply(body)

        body.force_encoding(encoding).encode("UTF-8")
      end

      private

      # Picks the plain-text body of the message and decodes it.
      #
      # Returns the decoded text, or "" when the message has no plain-text
      # part, when decoding fails, or when the text still looks like raw MIME.
      def select_body(message)
        part   = message.text_part if message.multipart?
        part ||= message           if message.content_type !~ %r{text/html}
        return "" unless part

        text = fix_charset(part)
        # fix_charset returns nil when decoding raised; treat that like a
        # missing body instead of letting nil reach #execute.
        return "" unless text

        # Certain trigger phrases mean the message was not parsed correctly.
        return "" if text =~ %r{(Content-Type:|multipart/alternative|text/plain)}

        text
      end

      # Force encoding to UTF-8 on a Mail::Message or Mail::Part.
      #
      # Returns the decoded body as a String, or nil when object is nil or
      # when decoding raises (for example because the declared charset is not
      # a known encoding). Failures are swallowed on purpose: this is a
      # best-effort conversion and callers handle nil.
      def fix_charset(object)
        return nil if object.nil?

        if object.charset
          # Some senders declare "utf8", which is not a valid encoding name.
          charset = object.charset.gsub(/utf8/i, "UTF-8")
          object.body.decoded.force_encoding(charset).encode("UTF-8")
        else
          object.body.to_s
        end
      rescue StandardError
        nil
      end

      # Cuts the body off at the first line that looks like the start of a
      # quoted message and returns the stripped text before it.
      #
      # NOTE: the first line is always kept, even when it matches one of the
      # heuristics itself, because range_end starts at 0.
      def discourse_email_trimmer(body)
        lines = body.scrub.lines.to_a
        range_end = 0

        lines.each_index do |idx|
          break if quoted_section_start?(lines, idx)

          range_end = idx
        end

        lines[0..range_end].join.strip
      end

      # True when lines[idx] looks like the first line of quoted content.
      def quoted_section_start?(lines, idx)
        line = lines[idx]

        # This one might be controversial but so many reply lines have years,
        # times and end with a colon. Let's try it and see how well it works.
        return true if line =~ /\d{4}/ && line =~ /\d:\d\d/ && line =~ /:$/
        return true if line =~ /On \w+ \d+,? \d+,?.*wrote:/

        # Headers on subsequent lines (this line and the next two).
        return true if (0..2).all? { |offset| lines[idx + offset] =~ REPLYING_HEADER_REGEX }

        # Headers on the same line.
        REPLYING_HEADER_LABELS.count { |label| line.include?(label) } >= 3
      end
    end
  end
end