[PATCH 04/11] parsemail: Move parsing to 'parser'

Stephen Finucane <stephen.finucane@intel.com>
Wed Jul 13 19:40:53 AEST 2016


Separate the parsing of mails from the CLI aspects of parsemail. Since
there is already a 'parser' module, it makes sense to place the parsing
functionality there. This also allows the parsing code to be reused
elsewhere in the future.

Signed-off-by: Stephen Finucane <stephen.finucane@intel.com>
---
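With this change, callers import the parsing entry point from
'patchwork.parser' rather than from the CLI script. A minimal sketch of a
standalone caller, assuming a configured Django environment; the settings
module, mbox path and list ID below are illustrative only:

    import mailbox
    import os

    import django

    # Point Django at the local Patchwork settings before the parser
    # (and hence patchwork.models) is imported; the module path here is
    # an assumption, adjust it to the local install.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE',
                          'patchwork.settings.production')
    django.setup()

    from patchwork.parser import parse_mail

    # parse_mail() returns the Patch, CoverLetter or Comment it created,
    # or None if the mail was ignored or could not be matched.
    for msg in mailbox.mbox('/tmp/example.mbox'):
        obj = parse_mail(msg, list_id='patchwork.example.com')
        print(type(obj).__name__ if obj else 'skipped')

This mirrors what parsearchive.py does below, minus the per-type counting.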
 patchwork/bin/parsearchive.py  |    4 +-
 patchwork/bin/parsemail.py     |  523 +---------------------------------------
 patchwork/parser.py            |  526 +++++++++++++++++++++++++++++++++++++++-
 patchwork/tests/test_parser.py |   17 +-
 4 files changed, 536 insertions(+), 534 deletions(-)
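For reference, the subject-handling helpers that move along with
parse_mail() cooperate as follows. This is an illustrative trace rather
than part of the change, and it assumes a configured Django environment
since patchwork.parser pulls in patchwork.models:

    from patchwork.parser import clean_subject, parse_series_marker

    # clean_subject() strips the '[PATCH ...]' prefix but keeps any other
    # bracketed tokens; parse_series_marker() then extracts the x/n series
    # marker from the remaining prefixes.
    name, prefixes = clean_subject('[PATCH 2/7] Add foo')
    # name == '[2/7] Add foo', prefixes == ['2/7']

    x, n = parse_series_marker(prefixes)
    # x == 2, n == 7

parse_mail() uses this pair to distinguish 0/n mails, i.e. potential cover
letters, from ordinary patches and comments.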

diff --git a/patchwork/bin/parsearchive.py b/patchwork/bin/parsearchive.py
index 30bca13..8986b22 100755
--- a/patchwork/bin/parsearchive.py
+++ b/patchwork/bin/parsearchive.py
@@ -29,7 +29,7 @@ import mailbox
 
 import django
 
-from patchwork.bin import parsemail
+from patchwork.parser import parse_mail
 from patchwork import models
 
 LOGGER = logging.getLogger(__name__)
@@ -55,7 +55,7 @@ def parse_mbox(path, list_id):
     mbox = mailbox.mbox(path)
     for msg in mbox:
         try:
-            obj = parsemail.parse_mail(msg, list_id)
+            obj = parse_mail(msg, list_id)
             if obj:
                 results[type(obj)] += 1
             else:
diff --git a/patchwork/bin/parsemail.py b/patchwork/bin/parsemail.py
index 56cd126..abcee04 100755
--- a/patchwork/bin/parsemail.py
+++ b/patchwork/bin/parsemail.py
@@ -22,29 +22,15 @@
 from __future__ import absolute_import
 
 import argparse
-import codecs
-import datetime
 from email import message_from_file
-from email.header import Header, decode_header
-from email.utils import parsedate_tz, mktime_tz
-from fnmatch import fnmatch
-from functools import reduce
 import logging
-import operator
-import re
 import sys
 
 import django
 from django.conf import settings
-from django.contrib.auth.models import User
 from django.utils.log import AdminEmailHandler
-from django.utils import six
-from django.utils.six.moves import map
 
-from patchwork.models import (Patch, Project, Person, Comment, State,
-                              DelegationRule, Submission, CoverLetter,
-                              get_default_initial_patch_state)
-from patchwork.parser import parse_patch, find_filenames
+from patchwork.parser import parse_mail
 
 LOGGER = logging.getLogger(__name__)
 
@@ -56,513 +42,6 @@ VERBOSITY_LEVELS = {
     'critical': logging.CRITICAL
 }
 
-list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
-
-
-def normalise_space(str):
-    whitespace_re = re.compile(r'\s+')
-    return whitespace_re.sub(' ', str).strip()
-
-
-def clean_header(header):
-    """Decode (possibly non-ascii) headers."""
-    def decode(fragment):
-        (frag_str, frag_encoding) = fragment
-        if frag_encoding:
-            return frag_str.decode(frag_encoding)
-        elif isinstance(frag_str, six.binary_type):  # python 2
-            return frag_str.decode()
-        return frag_str
-
-    fragments = list(map(decode, decode_header(header)))
-
-    return normalise_space(u' '.join(fragments))
-
-
-def find_project_by_id(list_id):
-    """Find a `project` object with given `list_id`."""
-    project = None
-    try:
-        project = Project.objects.get(listid=list_id)
-    except Project.DoesNotExist:
-        pass
-    return project
-
-
-def find_project_by_header(mail):
-    project = None
-    listid_res = [re.compile(r'.*<([^>]+)>.*', re.S),
-                  re.compile(r'^([\S]+)$', re.S)]
-
-    for header in list_id_headers:
-        if header in mail:
-
-            for listid_re in listid_res:
-                match = listid_re.match(mail.get(header))
-                if match:
-                    break
-
-            if not match:
-                continue
-
-            listid = match.group(1)
-
-            project = find_project_by_id(listid)
-            if project:
-                break
-
-    return project
-
-
-def find_author(mail):
-
-    from_header = clean_header(mail.get('From'))
-    name, email = (None, None)
-
-    # tuple of (regex, fn)
-    #  - where fn returns a (name, email) tuple from the match groups resulting
-    #    from re.match().groups()
-    from_res = [
-        # for "Firstname Lastname" <example at example.com> style addresses
-        (re.compile(r'"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))),
-
-        # for example@example.com (Firstname Lastname) style addresses
-        (re.compile(r'"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))),
-
-        # for example at example.com (Firstname Lastname) style addresses
-        (re.compile(r'(.*?)\sat\s(.*?)\s*\(([^\)]+)\)'),
-         (lambda g: (g[2], '@'.join(g[0:2])))),
-
-        # everything else
-        (re.compile(r'(.*)'), (lambda g: (None, g[0]))),
-    ]
-
-    for regex, fn in from_res:
-        match = regex.match(from_header)
-        if match:
-            (name, email) = fn(match.groups())
-            break
-
-    if email is None:
-        raise ValueError("Invalid 'From' header")
-
-    email = email.strip()
-    if name is not None:
-        name = name.strip()
-
-    try:
-        person = Person.objects.get(email__iexact=email)
-        if name:  # use the latest provided name
-            person.name = name
-    except Person.DoesNotExist:
-        person = Person(name=name, email=email)
-
-    return person
-
-
-def find_date(mail):
-    t = parsedate_tz(mail.get('Date', ''))
-    if not t:
-        return datetime.datetime.utcnow()
-    return datetime.datetime.utcfromtimestamp(mktime_tz(t))
-
-
-def find_headers(mail):
-    return reduce(operator.__concat__,
-                  ['%s: %s\n' % (k, Header(v, header_name=k,
-                                           continuation_ws='\t').encode())
-                   for (k, v) in list(mail.items())])
-
-
-def find_pull_request(content):
-    git_re = re.compile(r'^The following changes since commit.*' +
-                        r'^are available in the git repository at:\n'
-                        r'^\s*([\S]+://[^\n]+)$',
-                        re.DOTALL | re.MULTILINE)
-    match = git_re.search(content)
-    if match:
-        return match.group(1)
-    return None
-
-
-def find_references(mail):
-    """Construct a list of possible reply message ids."""
-    refs = []
-
-    if 'In-Reply-To' in mail:
-        refs.append(mail.get('In-Reply-To'))
-
-    if 'References' in mail:
-        rs = mail.get('References').split()
-        rs.reverse()
-        for r in rs:
-            if r not in refs:
-                refs.append(r)
-
-    return refs
-
-
-def parse_series_marker(subject_prefixes):
-    """Extract series markers from subject.
-
-    Extract the markers of multi-patches series, i.e. 'x/n', from the
-    provided subject series.
-
-    Args:
-        subject_prefixes: List of subject prefixes to extract markers
-          from
-
-    Returns:
-        (x, n) if markers found, else (None, None)
-    """
-
-    regex = re.compile('^([0-9]+)/([0-9]+)$')
-    for prefix in subject_prefixes:
-        m = regex.match(prefix)
-        if not m:
-            continue
-        return (int(m.group(1)), int(m.group(2)))
-    return (None, None)
-
-
-def find_content(project, mail):
-    patchbuf = None
-    commentbuf = ''
-
-    for part in mail.walk():
-        if part.get_content_maintype() != 'text':
-            continue
-
-        payload = part.get_payload(decode=True)
-        subtype = part.get_content_subtype()
-
-        if not isinstance(payload, six.text_type):
-            charset = part.get_content_charset()
-
-            # Check that we have a charset that we understand. Otherwise,
-            # ignore it and fallback to our standard set.
-            if charset is not None:
-                try:
-                    codecs.lookup(charset)
-                except LookupError:
-                    charset = None
-
-            # If there is no charset or if it is unknown, then try some common
-            # charsets before we fail.
-            if charset is None:
-                try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
-            else:
-                try_charsets = [charset]
-
-            for cset in try_charsets:
-                try:
-                    payload = six.text_type(payload, cset)
-                    break
-                except UnicodeDecodeError:
-                    payload = None
-
-            # Could not find a valid decoded payload.  Fail.
-            if payload is None:
-                return None, None
-
-        if subtype in ['x-patch', 'x-diff']:
-            patchbuf = payload
-        elif subtype == 'plain':
-            c = payload
-
-            if not patchbuf:
-                patchbuf, c = parse_patch(payload)
-
-            if c is not None:
-                commentbuf += c.strip() + '\n'
-
-    commentbuf = clean_content(commentbuf)
-
-    return patchbuf, commentbuf
-
-
-def find_submission_for_comment(project, refs):
-    for ref in refs:
-        # first, check for a direct reply
-        try:
-            submission = Submission.objects.get(project=project, msgid=ref)
-            return submission
-        except Submission.DoesNotExist:
-            pass
-
-        # see if we have comments that refer to a patch
-        try:
-            comment = Comment.objects.get(submission__project=project,
-                                          msgid=ref)
-            return comment.submission
-        except Comment.MultipleObjectsReturned:
-            # NOTE(stephenfin): This is a artifact of prior lack of support
-            # for cover letters in Patchwork. Previously all replies to
-            # patches were saved as comments. However, it's possible that
-            # someone could have created a new series as a reply to one of the
-            # comments on the original patch series. For example,
-            # '2015-November/002096.html' from the Patchwork archives. In this
-            # case, reparsing the archives will result in creation of a cover
-            # letter with the same message ID as the existing comment. Follow
-            # up comments will then apply to both this cover letter and the
-            # linked patch from the comment previously created. We choose to
-            # apply the comment to the cover letter. Note that this only
-            # happens when running 'parsearchive' or similar, so it should not
-            # affect every day use in any way.
-            comments = Comment.objects.filter(submission__project=project,
-                                              msgid=ref)
-            # The latter item will be the cover letter
-            return comments.reverse()[0].submission
-        except Comment.DoesNotExist:
-            pass
-
-    return None
-
-
-def split_prefixes(prefix):
-    """Turn a prefix string into a list of prefix tokens."""
-    split_re = re.compile(r'[,\s]+')
-    matches = split_re.split(prefix)
-
-    return [s for s in matches if s != '']
-
-
-def clean_subject(subject, drop_prefixes=None):
-    """Clean a Subject: header from an incoming patch.
-
-    Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By
-    default, only [PATCH] is removed, and we keep any other bracketed
-    data in the subject. If drop_prefixes is provided, remove those
-    too, comparing case-insensitively.
-
-    Args:
-        subject: Subject to be cleaned
-        drop_prefixes: Additional, case-insensitive prefixes to remove
-          from the subject
-    """
-    re_re = re.compile(r'^(re|fwd?)[:\s]\s*', re.I)
-    prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$')
-    subject = clean_header(subject)
-
-    if drop_prefixes is None:
-        drop_prefixes = []
-    else:
-        drop_prefixes = [s.lower() for s in drop_prefixes]
-
-    drop_prefixes.append('patch')
-
-    # remove Re:, Fwd:, etc
-    subject = re_re.sub(' ', subject)
-
-    subject = normalise_space(subject)
-
-    prefixes = []
-
-    match = prefix_re.match(subject)
-
-    while match:
-        prefix_str = match.group(1)
-        prefixes += [p for p in split_prefixes(prefix_str)
-                     if p.lower() not in drop_prefixes]
-
-        subject = match.group(2)
-        match = prefix_re.match(subject)
-
-    subject = normalise_space(subject)
-
-    subject = subject.strip()
-    if prefixes:
-        subject = '[%s] %s' % (','.join(prefixes), subject)
-
-    return (subject, prefixes)
-
-
-def clean_content(content):
-    """Remove cruft from the email message.
-
-    Catch signature (-- ) and list footer (_____) cruft.
-    """
-    sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M)
-    content = sig_re.sub('', content)
-
-    return content.strip()
-
-
-def find_state(mail):
-    """Return the state with the given name or the default."""
-    state_name = mail.get('X-Patchwork-State', '').strip()
-    if state_name:
-        try:
-            return State.objects.get(name__iexact=state_name)
-        except State.DoesNotExist:
-            pass
-    return get_default_initial_patch_state()
-
-
-def auto_delegate(project, filenames):
-    if not filenames:
-        return None
-
-    rules = list(DelegationRule.objects.filter(project=project))
-
-    patch_delegate = None
-
-    for filename in filenames:
-        file_delegate = None
-        for rule in rules:
-            if fnmatch(filename, rule.path):
-                file_delegate = rule.user
-                break
-
-        if file_delegate is None:
-            return None
-
-        if patch_delegate is not None and file_delegate != patch_delegate:
-            return None
-
-        patch_delegate = file_delegate
-
-    return patch_delegate
-
-
-def find_delegate(mail):
-    """Return the delegate with the given email or None."""
-    delegate_email = mail.get('X-Patchwork-Delegate', '').strip()
-    if delegate_email:
-        try:
-            return User.objects.get(email__iexact=delegate_email)
-        except User.DoesNotExist:
-            pass
-    return None
-
-
-def parse_mail(mail, list_id=None):
-    """Parse a mail and add to the database.
-
-    Args:
-        mail (`mbox.Mail`): Mail to parse and add.
-        list_id (str): Mailing list ID
-
-    Returns:
-        None
-    """
-    # some basic sanity checks
-    if 'From' not in mail:
-        raise ValueError("Missing 'From' header")
-
-    if 'Subject' not in mail:
-        raise ValueError("Missing 'Subject' header")
-
-    if 'Message-Id' not in mail:
-        raise ValueError("Missing 'Message-Id' header")
-
-    hint = mail.get('X-Patchwork-Hint', '').lower()
-    if hint == 'ignore':
-        LOGGER.debug("Ignoring email due to 'ignore' hint")
-        return
-
-    if list_id:
-        project = find_project_by_id(list_id)
-    else:
-        project = find_project_by_header(mail)
-
-    if project is None:
-        LOGGER.error('Failed to find a project for email')
-        return
-
-    # parse content
-
-    diff, message = find_content(project, mail)
-
-    if not (diff or message):
-        return  # nothing to work with
-
-    msgid = mail.get('Message-Id').strip()
-    author = find_author(mail)
-    name, prefixes = clean_subject(mail.get('Subject'), [project.linkname])
-    x, n = parse_series_marker(prefixes)
-    refs = find_references(mail)
-    date = find_date(mail)
-    headers = find_headers(mail)
-    pull_url = find_pull_request(message)
-
-    # build objects
-
-    if diff or pull_url:  # patches or pull requests
-        # we delay the saving until we know we have a patch.
-        author.save()
-
-        delegate = find_delegate(mail)
-        if not delegate and diff:
-            filenames = find_filenames(diff)
-            delegate = auto_delegate(project, filenames)
-
-        patch = Patch(
-            msgid=msgid,
-            project=project,
-            name=name,
-            date=date,
-            headers=headers,
-            submitter=author,
-            content=message,
-            diff=diff,
-            pull_url=pull_url,
-            delegate=delegate,
-            state=find_state(mail))
-        patch.save()
-        LOGGER.debug('Patch saved')
-
-        return patch
-    elif x == 0:  # (potential) cover letters
-        # if refs are empty, it's implicitly a cover letter. If not,
-        # however, we need to see if a match already exists and, if
-        # not, assume that it is indeed a new cover letter
-        is_cover_letter = False
-        if not refs == []:
-            try:
-                CoverLetter.objects.all().get(name=name)
-            except CoverLetter.DoesNotExist:  # no match => new cover
-                is_cover_letter = True
-        else:
-            is_cover_letter = True
-
-        if is_cover_letter:
-            author.save()
-
-            cover_letter = CoverLetter(
-                msgid=msgid,
-                project=project,
-                name=name,
-                date=date,
-                headers=headers,
-                submitter=author,
-                content=message)
-            cover_letter.save()
-            LOGGER.debug('Cover letter saved')
-
-            return cover_letter
-
-    # comments
-
-    # we only save comments if we have the parent email
-    submission = find_submission_for_comment(project, refs)
-    if not submission:
-        return
-
-    author.save()
-
-    comment = Comment(
-        submission=submission,
-        msgid=msgid,
-        date=date,
-        headers=headers,
-        submitter=author,
-        content=message)
-    comment.save()
-    LOGGER.debug('Comment saved')
-
-    return comment
-
 extra_error_message = '''
 == Mail
 
diff --git a/patchwork/parser.py b/patchwork/parser.py
index c9c058d..938b965 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -19,13 +19,359 @@
 # along with Patchwork; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
+import codecs
+import datetime
+from email.header import Header, decode_header
+from email.utils import parsedate_tz, mktime_tz
+from fnmatch import fnmatch
+from functools import reduce
+import logging
+import operator
 import re
 
+from django.contrib.auth.models import User
+from django.utils import six
 from django.utils.six.moves import map
 
+from patchwork.models import (Patch, Project, Person, Comment, State,
+                              DelegationRule, Submission, CoverLetter,
+                              get_default_initial_patch_state)
 
-_hunk_re = re.compile('^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
-_filename_re = re.compile('^(---|\+\+\+) (\S+)')
+
+_hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
+_filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
+list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
+
+LOGGER = logging.getLogger(__name__)
+
+
+def normalise_space(str):
+    whitespace_re = re.compile(r'\s+')
+    return whitespace_re.sub(' ', str).strip()
+
+
+def clean_header(header):
+    """Decode (possibly non-ascii) headers."""
+    def decode(fragment):
+        (frag_str, frag_encoding) = fragment
+        if frag_encoding:
+            return frag_str.decode(frag_encoding)
+        elif isinstance(frag_str, six.binary_type):  # python 2
+            return frag_str.decode()
+        return frag_str
+
+    fragments = list(map(decode, decode_header(header)))
+
+    return normalise_space(u' '.join(fragments))
+
+
+def find_project_by_id(list_id):
+    """Find a `project` object with given `list_id`."""
+    project = None
+    try:
+        project = Project.objects.get(listid=list_id)
+    except Project.DoesNotExist:
+        pass
+    return project
+
+
+def find_project_by_header(mail):
+    project = None
+    listid_res = [re.compile(r'.*<([^>]+)>.*', re.S),
+                  re.compile(r'^([\S]+)$', re.S)]
+
+    for header in list_id_headers:
+        if header in mail:
+
+            for listid_re in listid_res:
+                match = listid_re.match(mail.get(header))
+                if match:
+                    break
+
+            if not match:
+                continue
+
+            listid = match.group(1)
+
+            project = find_project_by_id(listid)
+            if project:
+                break
+
+    return project
+
+
+def find_author(mail):
+    from_header = clean_header(mail.get('From'))
+    name, email = (None, None)
+
+    # tuple of (regex, fn)
+    #  - where fn returns a (name, email) tuple from the match groups resulting
+    #    from re.match().groups()
+    from_res = [
+        # for "Firstname Lastname" <example at example.com> style addresses
+        (re.compile(r'"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))),
+
+        # for example@example.com (Firstname Lastname) style addresses
+        (re.compile(r'"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))),
+
+        # for example at example.com (Firstname Lastname) style addresses
+        (re.compile(r'(.*?)\sat\s(.*?)\s*\(([^\)]+)\)'),
+         (lambda g: (g[2], '@'.join(g[0:2])))),
+
+        # everything else
+        (re.compile(r'(.*)'), (lambda g: (None, g[0]))),
+    ]
+
+    for regex, fn in from_res:
+        match = regex.match(from_header)
+        if match:
+            (name, email) = fn(match.groups())
+            break
+
+    if email is None:
+        raise ValueError("Invalid 'From' header")
+
+    email = email.strip()
+    if name is not None:
+        name = name.strip()
+
+    try:
+        person = Person.objects.get(email__iexact=email)
+        if name:  # use the latest provided name
+            person.name = name
+    except Person.DoesNotExist:
+        person = Person(name=name, email=email)
+
+    return person
+
+
+def find_date(mail):
+    t = parsedate_tz(mail.get('Date', ''))
+    if not t:
+        return datetime.datetime.utcnow()
+    return datetime.datetime.utcfromtimestamp(mktime_tz(t))
+
+
+def find_headers(mail):
+    return reduce(operator.__concat__,
+                  ['%s: %s\n' % (k, Header(v, header_name=k,
+                                           continuation_ws='\t').encode())
+                   for (k, v) in list(mail.items())])
+
+
+def find_pull_request(content):
+    git_re = re.compile(r'^The following changes since commit.*' +
+                        r'^are available in the git repository at:\n'
+                        r'^\s*([\S]+://[^\n]+)$',
+                        re.DOTALL | re.MULTILINE)
+    match = git_re.search(content)
+    if match:
+        return match.group(1)
+    return None
+
+
+def find_references(mail):
+    """Construct a list of possible reply message ids."""
+    refs = []
+
+    if 'In-Reply-To' in mail:
+        refs.append(mail.get('In-Reply-To'))
+
+    if 'References' in mail:
+        rs = mail.get('References').split()
+        rs.reverse()
+        for r in rs:
+            if r not in refs:
+                refs.append(r)
+
+    return refs
+
+
+def parse_series_marker(subject_prefixes):
+    """Extract series markers from subject.
+
+    Extract the markers of multi-patches series, i.e. 'x/n', from the
+    provided subject series.
+
+    Args:
+        subject_prefixes: List of subject prefixes to extract markers
+          from
+
+    Returns:
+        (x, n) if markers found, else (None, None)
+    """
+
+    regex = re.compile('^([0-9]+)/([0-9]+)$')
+    for prefix in subject_prefixes:
+        m = regex.match(prefix)
+        if not m:
+            continue
+        return (int(m.group(1)), int(m.group(2)))
+    return (None, None)
+
+
+def find_content(project, mail):
+    """Extract a comment and potential diff from a mail."""
+    patchbuf = None
+    commentbuf = ''
+
+    for part in mail.walk():
+        if part.get_content_maintype() != 'text':
+            continue
+
+        payload = part.get_payload(decode=True)
+        subtype = part.get_content_subtype()
+
+        if not isinstance(payload, six.text_type):
+            charset = part.get_content_charset()
+
+            # Check that we have a charset that we understand. Otherwise,
+            # ignore it and fallback to our standard set.
+            if charset is not None:
+                try:
+                    codecs.lookup(charset)
+                except LookupError:
+                    charset = None
+
+            # If there is no charset or if it is unknown, then try some common
+            # charsets before we fail.
+            if charset is None:
+                try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
+            else:
+                try_charsets = [charset]
+
+            for cset in try_charsets:
+                try:
+                    payload = six.text_type(payload, cset)
+                    break
+                except UnicodeDecodeError:
+                    payload = None
+
+            # Could not find a valid decoded payload.  Fail.
+            if payload is None:
+                return None, None
+
+        if subtype in ['x-patch', 'x-diff']:
+            patchbuf = payload
+        elif subtype == 'plain':
+            c = payload
+
+            if not patchbuf:
+                patchbuf, c = parse_patch(payload)
+
+            if c is not None:
+                commentbuf += c.strip() + '\n'
+
+    commentbuf = clean_content(commentbuf)
+
+    return patchbuf, commentbuf
+
+
+def find_submission_for_comment(project, refs):
+    for ref in refs:
+        # first, check for a direct reply
+        try:
+            submission = Submission.objects.get(project=project, msgid=ref)
+            return submission
+        except Submission.DoesNotExist:
+            pass
+
+        # see if we have comments that refer to a patch
+        try:
+            comment = Comment.objects.get(submission__project=project,
+                                          msgid=ref)
+            return comment.submission
+        except Comment.MultipleObjectsReturned:
+            # NOTE(stephenfin): This is a artifact of prior lack of support
+            # for cover letters in Patchwork. Previously all replies to
+            # patches were saved as comments. However, it's possible that
+            # someone could have created a new series as a reply to one of the
+            # comments on the original patch series. For example,
+            # '2015-November/002096.html' from the Patchwork archives. In this
+            # case, reparsing the archives will result in creation of a cover
+            # letter with the same message ID as the existing comment. Follow
+            # up comments will then apply to both this cover letter and the
+            # linked patch from the comment previously created. We choose to
+            # apply the comment to the cover letter. Note that this only
+            # happens when running 'parsearchive' or similar, so it should not
+            # affect every day use in any way.
+            comments = Comment.objects.filter(submission__project=project,
+                                              msgid=ref)
+            # The latter item will be the cover letter
+            return comments.reverse()[0].submission
+        except Comment.DoesNotExist:
+            pass
+
+    return None
+
+
+def split_prefixes(prefix):
+    """Turn a prefix string into a list of prefix tokens."""
+    split_re = re.compile(r'[,\s]+')
+    matches = split_re.split(prefix)
+
+    return [s for s in matches if s != '']
+
+
+def clean_subject(subject, drop_prefixes=None):
+    """Clean a Subject: header from an incoming patch.
+
+    Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By
+    default, only [PATCH] is removed, and we keep any other bracketed
+    data in the subject. If drop_prefixes is provided, remove those
+    too, comparing case-insensitively.
+
+    Args:
+        subject: Subject to be cleaned
+        drop_prefixes: Additional, case-insensitive prefixes to remove
+          from the subject
+    """
+    re_re = re.compile(r'^(re|fwd?)[:\s]\s*', re.I)
+    prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$')
+    subject = clean_header(subject)
+
+    if drop_prefixes is None:
+        drop_prefixes = []
+    else:
+        drop_prefixes = [s.lower() for s in drop_prefixes]
+
+    drop_prefixes.append('patch')
+
+    # remove Re:, Fwd:, etc
+    subject = re_re.sub(' ', subject)
+
+    subject = normalise_space(subject)
+
+    prefixes = []
+
+    match = prefix_re.match(subject)
+
+    while match:
+        prefix_str = match.group(1)
+        prefixes += [p for p in split_prefixes(prefix_str)
+                     if p.lower() not in drop_prefixes]
+
+        subject = match.group(2)
+        match = prefix_re.match(subject)
+
+    subject = normalise_space(subject)
+
+    subject = subject.strip()
+    if prefixes:
+        subject = '[%s] %s' % (','.join(prefixes), subject)
+
+    return (subject, prefixes)
+
+
+def clean_content(content):
+    """Remove cruft from the email message.
+
+    Catch signature (-- ) and list footer (_____) cruft.
+    """
+    sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M)
+    content = sig_re.sub('', content)
+
+    return content.strip()
 
 
 def parse_patch(content):
@@ -181,6 +527,182 @@ def parse_patch(content):
     return patchbuf, commentbuf
 
 
+def find_state(mail):
+    """Return the state with the given name or the default."""
+    state_name = mail.get('X-Patchwork-State', '').strip()
+    if state_name:
+        try:
+            return State.objects.get(name__iexact=state_name)
+        except State.DoesNotExist:
+            pass
+    return get_default_initial_patch_state()
+
+
+def auto_delegate(project, filenames):
+    if not filenames:
+        return None
+
+    rules = list(DelegationRule.objects.filter(project=project))
+
+    patch_delegate = None
+
+    for filename in filenames:
+        file_delegate = None
+        for rule in rules:
+            if fnmatch(filename, rule.path):
+                file_delegate = rule.user
+                break
+
+        if file_delegate is None:
+            return None
+
+        if patch_delegate is not None and file_delegate != patch_delegate:
+            return None
+
+        patch_delegate = file_delegate
+
+    return patch_delegate
+
+
+def find_delegate(mail):
+    """Return the delegate with the given email or None."""
+    delegate_email = mail.get('X-Patchwork-Delegate', '').strip()
+    if delegate_email:
+        try:
+            return User.objects.get(email__iexact=delegate_email)
+        except User.DoesNotExist:
+            pass
+    return None
+
+
+def parse_mail(mail, list_id=None):
+    """Parse a mail and add to the database.
+
+    Args:
+        mail (`mbox.Mail`): Mail to parse and add.
+        list_id (str): Mailing list ID
+
+    Returns:
+        None
+    """
+    # some basic sanity checks
+    if 'From' not in mail:
+        raise ValueError("Missing 'From' header")
+
+    if 'Subject' not in mail:
+        raise ValueError("Missing 'Subject' header")
+
+    if 'Message-Id' not in mail:
+        raise ValueError("Missing 'Message-Id' header")
+
+    hint = mail.get('X-Patchwork-Hint', '').lower()
+    if hint == 'ignore':
+        LOGGER.debug("Ignoring email due to 'ignore' hint")
+        return
+
+    if list_id:
+        project = find_project_by_id(list_id)
+    else:
+        project = find_project_by_header(mail)
+
+    if project is None:
+        LOGGER.error('Failed to find a project for email')
+        return
+
+    # parse content
+
+    diff, message = find_content(project, mail)
+
+    if not (diff or message):
+        return  # nothing to work with
+
+    msgid = mail.get('Message-Id').strip()
+    author = find_author(mail)
+    name, prefixes = clean_subject(mail.get('Subject'), [project.linkname])
+    x, n = parse_series_marker(prefixes)
+    refs = find_references(mail)
+    date = find_date(mail)
+    headers = find_headers(mail)
+    pull_url = find_pull_request(message)
+
+    # build objects
+
+    if diff or pull_url:  # patches or pull requests
+        # we delay the saving until we know we have a patch.
+        author.save()
+
+        delegate = find_delegate(mail)
+        if not delegate and diff:
+            filenames = find_filenames(diff)
+            delegate = auto_delegate(project, filenames)
+
+        patch = Patch(
+            msgid=msgid,
+            project=project,
+            name=name,
+            date=date,
+            headers=headers,
+            submitter=author,
+            content=message,
+            diff=diff,
+            pull_url=pull_url,
+            delegate=delegate,
+            state=find_state(mail))
+        patch.save()
+        LOGGER.debug('Patch saved')
+
+        return patch
+    elif x == 0:  # (potential) cover letters
+        # if refs are empty, it's implicitly a cover letter. If not,
+        # however, we need to see if a match already exists and, if
+        # not, assume that it is indeed a new cover letter
+        is_cover_letter = False
+        if not refs == []:
+            try:
+                CoverLetter.objects.all().get(name=name)
+            except CoverLetter.DoesNotExist:  # no match => new cover
+                is_cover_letter = True
+        else:
+            is_cover_letter = True
+
+        if is_cover_letter:
+            author.save()
+
+            cover_letter = CoverLetter(
+                msgid=msgid,
+                project=project,
+                name=name,
+                date=date,
+                headers=headers,
+                submitter=author,
+                content=message)
+            cover_letter.save()
+            LOGGER.debug('Cover letter saved')
+
+            return cover_letter
+
+    # comments
+
+    # we only save comments if we have the parent email
+    submission = find_submission_for_comment(project, refs)
+    if not submission:
+        return
+
+    author.save()
+
+    comment = Comment(
+        submission=submission,
+        msgid=msgid,
+        date=date,
+        headers=headers,
+        submitter=author,
+        content=message)
+    comment.save()
+    LOGGER.debug('Comment saved')
+
+    return comment
+
+
 def find_filenames(diff):
     """Find files changes in a given diff."""
     # normalise spaces
diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
index eca05a0..684a667 100644
--- a/patchwork/tests/test_parser.py
+++ b/patchwork/tests/test_parser.py
@@ -26,18 +26,19 @@ import os
 
 from django.test import TestCase
 
-from patchwork.bin.parsemail import clean_subject
-from patchwork.bin.parsemail import find_author
-from patchwork.bin.parsemail import find_content
-from patchwork.bin.parsemail import find_project_by_header
-from patchwork.bin.parsemail import find_pull_request
-from patchwork.bin.parsemail import parse_mail as _parse_mail
-from patchwork.bin.parsemail import parse_series_marker
-from patchwork.bin.parsemail import split_prefixes
 from patchwork.models import Comment
 from patchwork.models import Patch
 from patchwork.models import Person
 from patchwork.models import State
+from patchwork.parser import clean_subject
+from patchwork.parser import find_author
+from patchwork.parser import find_content
+from patchwork.parser import find_project_by_header
+from patchwork.parser import find_pull_request
+from patchwork.parser import parse_mail as _parse_mail
+from patchwork.parser import parse_series_marker
+from patchwork.parser import split_prefixes
+from patchwork.tests.utils import create_email
 from patchwork.tests.utils import create_project
 from patchwork.tests.utils import create_state
 from patchwork.tests.utils import create_user
-- 
1.7.4.1