BigW Consortium Gitlab

html_parser.rb 993 Bytes
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
module Gitlab
  module Email
    # Converts the HTML body of an email into plain text after stripping
    # selected elements from it.
    #
    # The pipeline is: parse +raw_body+ with Nokogiri, drop every
    # <blockquote> and <table> element, unwrap anchors that have no +href+,
    # then hand the remaining HTML to Html2Text.
    #
    # NOTE(review): the <blockquote>/<table> removal is presumably aimed at
    # quoted earlier messages inserted by mail clients -- confirm with callers.
    class HTMLParser
      # Convenience entry point: builds a parser for +raw_body+ and returns
      # its filtered plain text.
      #
      # @param raw_body the HTML source, passed as-is to Nokogiri::HTML.parse
      # @return the result of Html2Text.convert on the filtered HTML
      def self.parse_reply(raw_body)
        new(raw_body).filtered_text
      end

      # The unmodified HTML source this parser was constructed with.
      attr_reader :raw_body

      # @param raw_body the HTML source to parse
      def initialize(raw_body)
        @raw_body = raw_body
      end

      # The parsed document for +raw_body+. Parsed on first access and
      # reused afterwards, so mutations made by #filter_replies! are visible
      # to every later caller.
      def document
        @document ||= Nokogiri::HTML.parse(raw_body)
      end

      # Strips unwanted elements from #document in place.
      #
      # This mutates the memoized document; it is invoked once by
      # #filtered_html.
      def filter_replies!
        document.xpath('//blockquote').each(&:remove)
        document.xpath('//table').each(&:remove)

        # bogus links with no href are sometimes added by outlook,
        # and can result in Html2Text adding extra square brackets
        # to the text, so we unwrap them here.
        document.xpath('//a[not(@href)]').each do |link|
          # Replace the anchor with its own children, keeping the link
          # content but dropping the <a> wrapper.
          link.replace(link.children)
        end
      end

      # The document's inner HTML after #filter_replies! has been applied.
      # Computed on first access and memoized, so filtering runs only once
      # through this method.
      def filtered_html
        @filtered_html ||= begin
          filter_replies!
          document.inner_html
        end
      end

      # Plain-text rendering of #filtered_html produced by Html2Text.convert.
      # Memoized after the first call.
      def filtered_text
        @filtered_text ||= Html2Text.convert(filtered_html)
      end
    end
  end
end