[PATCH v2 3/6] parser: Move mail parsing into another module

Thu Dec 1 05:42:44 AEDT 2016

In '3c2ee2d', '6911241', and 'f995297', the parsing of mails and
building of Patch, Cover Letter and Comment objects was centralized in
'parser'. This was done to allow the 'parsemail' script to become a
'parsemail' management command and ease the development of a future
'/upload' endpoint which would separate the receiving of mails from the
parsing/displaying of same. While these changes are valid, they did
remove a 'main' function previously found in the 'parser' module. This
function allowed users to run the parser without actually adding a patch
to the database. As this was assumed to be unnecessary it was removed,
but it is in fact required by the Git post-receive hook found in
'tools'.

Move the code previously found in 'parser' to a more appropriately named
'extractor' function. This is a mere copy-paste and the missing 'main'
method will be added a follow-up patch.

Signed-off-by: Stephen Finucane <stephen at that.guru>
Cc: Paul Jakma <paul at jakma.org>
Cc: Tom Rini <trini at konsulko.com>
---
 patchwork/extractor.py         | 298 +++++++++++++++++++++++++++++++++++++++++
 patchwork/models.py            |  52 +------
 patchwork/parser.py            | 232 +-------------------------------
 patchwork/tests/test_parser.py |   2 +-
 4 files changed, 306 insertions(+), 278 deletions(-)
 create mode 100644 patchwork/extractor.py

diff --git a/patchwork/extractor.py b/patchwork/extractor.py
new file mode 100644
index 0000000..74fe4ab
--- /dev/null
+++ b/patchwork/extractor.py
@@ -0,0 +1,298 @@
+#!/usr/bin/env python
+#
+# Patchwork - automated patch tracking system
+# Copyright (C) 2008 Jeremy Kerr <jk at ozlabs.org>
+#
+# This file is part of the Patchwork package.
+#
+# Patchwork is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# Patchwork is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Patchwork; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""Extract comments and (optional) diffs from mbox files."""
+
+import codecs
+import hashlib
+import re
+
+from django.utils import six
+
+HUNK_RE = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
+FILENAME_RE = re.compile(r'^(---|\+\+\+) (\S+)')
+
+
+def parse_patch(content):
+    """Split a mail's contents into a diff and comment.
+
+    This is a state machine that takes a patch, generally in UNIX mbox
+    format, and splits it into the component comments and diff.
+
+    Args:
+        patch: The patch to be split
+
+    Returns:
+        A tuple containing the diff and comment. Either one or both of
+        these can be empty.
+
+    Raises:
+        Exception: The state machine transitioned to an invalid state.
+    """
+    patchbuf = ''
+    commentbuf = ''
+    buf = ''
+
+    # state specified the line we just saw, and what to expect next
+    state = 0
+    # 0: text
+    # 1: suspected patch header (diff, ====, Index:)
+    # 2: patch header line 1 (---)
+    # 3: patch header line 2 (+++)
+    # 4: patch hunk header line (@@ line)
+    # 5: patch hunk content
+    # 6: patch meta header (rename from/rename to)
+    #
+    # valid transitions:
+    #  0 -> 1 (diff, ===, Index:)
+    #  0 -> 2 (---)
+    #  1 -> 2 (---)
+    #  2 -> 3 (+++)
+    #  3 -> 4 (@@ line)
+    #  4 -> 5 (patch content)
+    #  5 -> 1 (run out of lines from @@-specifed count)
+    #  1 -> 6 (rename from / rename to)
+    #  6 -> 2 (---)
+    #  6 -> 1 (other text)
+    #
+    # Suspected patch header is stored into buf, and appended to
+    # patchbuf if we find a following hunk. Otherwise, append to
+    # comment after parsing.
+
+    # line counts while parsing a patch hunk
+    lc = (0, 0)
+    hunk = 0
+
+    for line in content.split('\n'):
+        line += '\n'
+
+        if state == 0:
+            if line.startswith('diff ') or line.startswith('===') \
+                    or line.startswith('Index: '):
+                state = 1
+                buf += line
+            elif line.startswith('--- '):
+                state = 2
+                buf += line
+            else:
+                commentbuf += line
+        elif state == 1:
+            buf += line
+            if line.startswith('--- '):
+                state = 2
+
+            if line.startswith(('rename from ', 'rename to ')):
+                state = 6
+        elif state == 2:
+            if line.startswith('+++ '):
+                state = 3
+                buf += line
+            elif hunk:
+                state = 1
+                buf += line
+            else:
+                state = 0
+                commentbuf += buf + line
+                buf = ''
+        elif state == 3:
+            match = HUNK_RE.match(line)
+            if match:
+                def fn(x):
+                    if not x:
+                        return 1
+                    return int(x)
+
+                lc = [fn(x) for x in match.groups()]
+
+                state = 4
+                patchbuf += buf + line
+                buf = ''
+            elif line.startswith('--- '):
+                patchbuf += buf + line
+                buf = ''
+                state = 2
+            elif hunk and line.startswith(r'\ No newline at end of file'):
+                # If we had a hunk and now we see this, it's part of the patch,
+                # and we're still expecting another @@ line.
+                patchbuf += line
+            elif hunk:
+                state = 1
+                buf += line
+            else:
+                state = 0
+                commentbuf += buf + line
+                buf = ''
+        elif state in [4, 5]:
+            if line.startswith('-'):
+                lc[0] -= 1
+            elif line.startswith('+'):
+                lc[1] -= 1
+            elif line.startswith(r'\ No newline at end of file'):
+                # Special case: Not included as part of the hunk's line count
+                pass
+            else:
+                lc[0] -= 1
+                lc[1] -= 1
+
+            patchbuf += line
+
+            if lc[0] <= 0 and lc[1] <= 0:
+                state = 3
+                hunk += 1
+            else:
+                state = 5
+        elif state == 6:
+            if line.startswith(('rename to ', 'rename from ')):
+                patchbuf += buf + line
+                buf = ''
+            elif line.startswith('--- '):
+                patchbuf += buf + line
+                buf = ''
+                state = 2
+            else:
+                buf += line
+                state = 1
+        else:
+            raise Exception("Unknown state %d! (line '%s')" % (state, line))
+
+    commentbuf += buf
+
+    if patchbuf == '':
+        patchbuf = None
+
+    if commentbuf == '':
+        commentbuf = None
+
+    return patchbuf, commentbuf
+
+
+def clean_content(content):
+    """Remove cruft from the email message.
+
+    Catch signature (-- ) and list footer (_____) cruft.
+    """
+    sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M)
+
+    content = sig_re.sub('', content)
+
+    return content.strip()
+
+
+def find_content(mail):
+    """Extract a comment and potential diff from a mail."""
+    patchbuf = None
+    commentbuf = ''
+
+    for part in mail.walk():
+        if part.get_content_maintype() != 'text':
+            continue
+
+        payload = part.get_payload(decode=True)
+        subtype = part.get_content_subtype()
+
+        if not isinstance(payload, six.text_type):
+            charset = part.get_content_charset()
+
+            # Check that we have a charset that we understand. Otherwise,
+            # ignore it and fallback to our standard set.
+            if charset is not None:
+                try:
+                    codecs.lookup(charset)
+                except LookupError:
+                    charset = None
+
+            # If there is no charset or if it is unknown, then try some common
+            # charsets before we fail.
+            if charset is None:
+                try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
+            else:
+                try_charsets = [charset]
+
+            for cset in try_charsets:
+                try:
+                    payload = six.text_type(payload, cset)
+                    break
+                except UnicodeDecodeError:
+                    payload = None
+
+            # Could not find a valid decoded payload.  Fail.
+            if payload is None:
+                return None, None
+
+        if subtype in ['x-patch', 'x-diff']:
+            patchbuf = payload
+        elif subtype == 'plain':
+            c = payload
+
+            if not patchbuf:
+                patchbuf, c = parse_patch(payload)
+
+            if c is not None:
+                commentbuf += c.strip() + '\n'
+
+    commentbuf = clean_content(commentbuf)
+
+    return patchbuf, commentbuf
+
+
+def hash_diff(diff):
+    """Generate a hash from a diff."""
+    # normalise spaces
+    diff = diff.replace('\r', '')
+    diff = diff.strip() + '\n'
+
+    prefixes = ['-', '+', ' ']
+    hash = hashlib.sha1()
+
+    for line in diff.split('\n'):
+        if len(line) <= 0:
+            continue
+
+        hunk_match = HUNK_RE.match(line)
+        filename_match = FILENAME_RE.match(line)
+
+        if filename_match:
+            # normalise -p1 top-directories
+            if filename_match.group(1) == '---':
+                filename = 'a/'
+            else:
+                filename = 'b/'
+            filename += '/'.join(filename_match.group(2).split('/')[1:])
+
+            line = filename_match.group(1) + ' ' + filename
+        elif hunk_match:
+            # remove line numbers, but leave line counts
+            def fn(x):
+                if not x:
+                    return 1
+                return int(x)
+            line_nos = list(map(fn, hunk_match.groups()))
+            line = '@@ -%d +%d @@' % tuple(line_nos)
+        elif line[0] in prefixes:
+            # if we have a +, - or context line, leave as-is
+            pass
+        else:
+            # other lines are ignored
+            continue
+
+        hash.update((line + '\n').encode('utf-8'))
+
+    return hash
diff --git a/patchwork/models.py b/patchwork/models.py
index 15a2936..19a5789 100644
--- a/patchwork/models.py
+++ b/patchwork/models.py
@@ -22,7 +22,6 @@ from __future__ import absolute_import
 
 from collections import Counter, OrderedDict
 import datetime
-import hashlib
 import random
 import re
 
@@ -34,6 +33,7 @@ from django.db import models
 from django.utils.encoding import python_2_unicode_compatible
 from django.utils.functional import cached_property
 
+from patchwork import extractor
 from patchwork.fields import HashField
 
 
@@ -366,54 +366,6 @@ class Patch(SeriesMixin, Submission):
 
         return counts
 
-    @staticmethod
-    def hash_diff(diff):
-        """Generate a hash from a diff."""
-        hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
-        filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
-
-        # normalise spaces
-        diff = diff.replace('\r', '')
-        diff = diff.strip() + '\n'
-
-        prefixes = ['-', '+', ' ']
-        hash = hashlib.sha1()
-
-        for line in diff.split('\n'):
-            if len(line) <= 0:
-                continue
-
-            hunk_match = hunk_re.match(line)
-            filename_match = filename_re.match(line)
-
-            if filename_match:
-                # normalise -p1 top-directories
-                if filename_match.group(1) == '---':
-                    filename = 'a/'
-                else:
-                    filename = 'b/'
-                filename += '/'.join(filename_match.group(2).split('/')[1:])
-
-                line = filename_match.group(1) + ' ' + filename
-            elif hunk_match:
-                # remove line numbers, but leave line counts
-                def fn(x):
-                    if not x:
-                        return 1
-                    return int(x)
-                line_nos = list(map(fn, hunk_match.groups()))
-                line = '@@ -%d +%d @@' % tuple(line_nos)
-            elif line[0] in prefixes:
-                # if we have a +, - or context line, leave as-is
-                pass
-            else:
-                # other lines are ignored
-                continue
-
-            hash.update((line + '\n').encode('utf-8'))
-
-        return hash
-
     def _set_tag(self, tag, count):
         if count == 0:
             self.patchtag_set.filter(tag=tag).delete()
@@ -441,7 +393,7 @@ class Patch(SeriesMixin, Submission):
             self.state = get_default_initial_patch_state()
 
         if self.hash is None and self.diff is not None:
-            self.hash = self.hash_diff(self.diff).hexdigest()
+            self.hash = extractor.hash_diff(self.diff).hexdigest()
 
         super(Patch, self).save(**kwargs)
 
diff --git a/patchwork/parser.py b/patchwork/parser.py
index 9d1b79e..3c2b89b 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -1,5 +1,3 @@
-#!/usr/bin/env python
-#
 # Patchwork - automated patch tracking system
 # Copyright (C) 2008 Jeremy Kerr <jk at ozlabs.org>
 #
@@ -19,7 +17,8 @@
 # along with Patchwork; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
-import codecs
+"""Parse mbox files and add to Patchwork."""
+
 import datetime
 from email.header import decode_header
 from email.header import make_header
@@ -32,6 +31,8 @@ import re
 from django.contrib.auth.models import User
 from django.utils import six
 
+from patchwork.extractor import find_content
+from patchwork.extractor import FILENAME_RE
 from patchwork.models import Comment
 from patchwork.models import CoverLetter
 from patchwork.models import DelegationRule
@@ -46,8 +47,6 @@ from patchwork.models import State
 from patchwork.models import Submission
 
 
-_hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
-_filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
 list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
 
 logger = logging.getLogger(__name__)
@@ -342,63 +341,6 @@ def parse_version(subject, subject_prefixes):
     return 1
 
 
-def find_content(mail):
-    """Extract a comment and potential diff from a mail."""
-    patchbuf = None
-    commentbuf = ''
-
-    for part in mail.walk():
-        if part.get_content_maintype() != 'text':
-            continue
-
-        payload = part.get_payload(decode=True)
-        subtype = part.get_content_subtype()
-
-        if not isinstance(payload, six.text_type):
-            charset = part.get_content_charset()
-
-            # Check that we have a charset that we understand. Otherwise,
-            # ignore it and fallback to our standard set.
-            if charset is not None:
-                try:
-                    codecs.lookup(charset)
-                except LookupError:
-                    charset = None
-
-            # If there is no charset or if it is unknown, then try some common
-            # charsets before we fail.
-            if charset is None:
-                try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
-            else:
-                try_charsets = [charset]
-
-            for cset in try_charsets:
-                try:
-                    payload = six.text_type(payload, cset)
-                    break
-                except UnicodeDecodeError:
-                    payload = None
-
-            # Could not find a valid decoded payload.  Fail.
-            if payload is None:
-                return None, None
-
-        if subtype in ['x-patch', 'x-diff']:
-            patchbuf = payload
-        elif subtype == 'plain':
-            c = payload
-
-            if not patchbuf:
-                patchbuf, c = parse_patch(payload)
-
-            if c is not None:
-                commentbuf += c.strip() + '\n'
-
-    commentbuf = clean_content(commentbuf)
-
-    return patchbuf, commentbuf
-
-
 def find_submission_for_comment(project, refs):
     for ref in refs:
         # first, check for a direct reply
@@ -502,170 +444,6 @@ def subject_check(subject):
     return comment_re.match(clean_header(subject))
 
 
-def clean_content(content):
-    """Remove cruft from the email message.
-
-    Catch signature (-- ) and list footer (_____) cruft.
-    """
-    sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M)
-    content = sig_re.sub('', content)
-
-    return content.strip()
-
-
-def parse_patch(content):
-    """Split a mail's contents into a diff and comment.
-
-    This is a state machine that takes a patch, generally in UNIX mbox
-    format, and splits it into the component comments and diff.
-
-    Args:
-        patch: The patch to be split
-
-    Returns:
-        A tuple containing the diff and comment. Either one or both of
-        these can be empty.
-
-    Raises:
-        Exception: The state machine transitioned to an invalid state.
-    """
-    patchbuf = ''
-    commentbuf = ''
-    buf = ''
-
-    # state specified the line we just saw, and what to expect next
-    state = 0
-    # 0: text
-    # 1: suspected patch header (diff, ====, Index:)
-    # 2: patch header line 1 (---)
-    # 3: patch header line 2 (+++)
-    # 4: patch hunk header line (@@ line)
-    # 5: patch hunk content
-    # 6: patch meta header (rename from/rename to)
-    #
-    # valid transitions:
-    #  0 -> 1 (diff, ===, Index:)
-    #  0 -> 2 (---)
-    #  1 -> 2 (---)
-    #  2 -> 3 (+++)
-    #  3 -> 4 (@@ line)
-    #  4 -> 5 (patch content)
-    #  5 -> 1 (run out of lines from @@-specifed count)
-    #  1 -> 6 (rename from / rename to)
-    #  6 -> 2 (---)
-    #  6 -> 1 (other text)
-    #
-    # Suspected patch header is stored into buf, and appended to
-    # patchbuf if we find a following hunk. Otherwise, append to
-    # comment after parsing.
-
-    # line counts while parsing a patch hunk
-    lc = (0, 0)
-    hunk = 0
-
-    for line in content.split('\n'):
-        line += '\n'
-
-        if state == 0:
-            if line.startswith('diff ') or line.startswith('===') \
-                    or line.startswith('Index: '):
-                state = 1
-                buf += line
-            elif line.startswith('--- '):
-                state = 2
-                buf += line
-            else:
-                commentbuf += line
-        elif state == 1:
-            buf += line
-            if line.startswith('--- '):
-                state = 2
-
-            if line.startswith(('rename from ', 'rename to ')):
-                state = 6
-        elif state == 2:
-            if line.startswith('+++ '):
-                state = 3
-                buf += line
-            elif hunk:
-                state = 1
-                buf += line
-            else:
-                state = 0
-                commentbuf += buf + line
-                buf = ''
-        elif state == 3:
-            match = _hunk_re.match(line)
-            if match:
-                def fn(x):
-                    if not x:
-                        return 1
-                    return int(x)
-
-                lc = [fn(x) for x in match.groups()]
-
-                state = 4
-                patchbuf += buf + line
-                buf = ''
-            elif line.startswith('--- '):
-                patchbuf += buf + line
-                buf = ''
-                state = 2
-            elif hunk and line.startswith(r'\ No newline at end of file'):
-                # If we had a hunk and now we see this, it's part of the patch,
-                # and we're still expecting another @@ line.
-                patchbuf += line
-            elif hunk:
-                state = 1
-                buf += line
-            else:
-                state = 0
-                commentbuf += buf + line
-                buf = ''
-        elif state in [4, 5]:
-            if line.startswith('-'):
-                lc[0] -= 1
-            elif line.startswith('+'):
-                lc[1] -= 1
-            elif line.startswith(r'\ No newline at end of file'):
-                # Special case: Not included as part of the hunk's line count
-                pass
-            else:
-                lc[0] -= 1
-                lc[1] -= 1
-
-            patchbuf += line
-
-            if lc[0] <= 0 and lc[1] <= 0:
-                state = 3
-                hunk += 1
-            else:
-                state = 5
-        elif state == 6:
-            if line.startswith(('rename to ', 'rename from ')):
-                patchbuf += buf + line
-                buf = ''
-            elif line.startswith('--- '):
-                patchbuf += buf + line
-                buf = ''
-                state = 2
-            else:
-                buf += line
-                state = 1
-        else:
-            raise Exception("Unknown state %d! (line '%s')" % (state, line))
-
-    commentbuf += buf
-
-    if patchbuf == '':
-        patchbuf = None
-
-    if commentbuf == '':
-        commentbuf = None
-
-    return patchbuf, commentbuf
-
-
 def parse_pull_request(content):
     git_re = re.compile(r'^The following changes since commit.*' +
                         r'^are available in the git repository at:\n'
@@ -940,7 +718,7 @@ def find_filenames(diff):
         if len(line) <= 0:
             continue
 
-        filename_match = _filename_re.match(line)
+        filename_match = FILENAME_RE.match(line)
         if not filename_match:
             continue
 
diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
index e4a379d..00891f9 100644
--- a/patchwork/tests/test_parser.py
+++ b/patchwork/tests/test_parser.py
@@ -27,13 +27,13 @@ import os
 from django.test import TestCase
 from django.utils import six
 
+from patchwork.extractor import find_content
 from patchwork.models import Comment
 from patchwork.models import Patch
 from patchwork.models import Person
 from patchwork.models import State
 from patchwork.parser import clean_subject
 from patchwork.parser import find_author
-from patchwork.parser import find_content
 from patchwork.parser import find_project_by_header
 from patchwork.parser import find_series
 from patchwork.parser import parse_mail as _parse_mail
-- 
2.9.3