[PATCH v5 3/7] parser: Add series parsing

Stephen Finucane stephen at that.guru
Mon Oct 10 09:25:17 AEDT 2016


It is now possible to parse and store series, so do just that.
The parsing at the moment is based on both RFC822 headers and
subject lines.

Signed-off-by: Stephen Finucane <stephen at that.guru>
---
v4:
- Update per new 'SeriesRevision'-'Patch' relationship
v3:
- Rework how nested series are handled once again
- Don't search for references when creating a cover letter
v2:
- Rebase onto master, moving changes into 'parser'
- Add unit tests
- Merge "Handle 'xxx (v2)' style messages" patch
- Merge "Handle series sent 'in-reply-to'" patch
- Handle capitalized version prefixes like [V2]
- Trivial cleanup of some parser functions
---
 patchwork/parser.py            | 130 +++++++++++++++++++++++++++++++++++++----
 patchwork/tests/test_parser.py | 112 +++++++++++++++++++++++++++++------
 patchwork/tests/utils.py       |  25 ++++++++
 3 files changed, 239 insertions(+), 28 deletions(-)

diff --git a/patchwork/parser.py b/patchwork/parser.py
index 15476b1..227d4d7 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -21,8 +21,10 @@
 
 import codecs
 import datetime
-from email.header import decode_header, make_header
-from email.utils import parsedate_tz, mktime_tz
+from email.header import decode_header
+from email.header import make_header
+from email.utils import mktime_tz
+from email.utils import parsedate_tz
 from fnmatch import fnmatch
 import logging
 import re
@@ -30,9 +32,17 @@ import re
 from django.contrib.auth.models import User
 from django.utils import six
 
-from patchwork.models import (Patch, Project, Person, Comment, State,
-                              DelegationRule, Submission, CoverLetter,
-                              get_default_initial_patch_state)
+from patchwork.models import Comment
+from patchwork.models import CoverLetter
+from patchwork.models import DelegationRule
+from patchwork.models import get_default_initial_patch_state
+from patchwork.models import Patch
+from patchwork.models import Person
+from patchwork.models import Project
+from patchwork.models import SeriesRevision
+from patchwork.models import SeriesReference
+from patchwork.models import State
+from patchwork.models import Submission
 
 
 _hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
@@ -162,6 +172,34 @@ def find_project_by_header(mail):
     return project
 
 
+def find_series(mail):
+    """Find a patch's `SeriesRevision`.
+
+    Traverse RFC822 headers, starting with most recent first, to find
+    ancestors and the related series. Headers are traversed in reverse
+    to handle series sent in reply to previous series, like so:
+
+        [PATCH 0/3] A cover letter
+          [PATCH 1/3] The first patch
+          ...
+          [PATCH v2 0/3] A cover letter
+            [PATCH v2 1/3] The first patch
+            ...
+
+    Args:
+        mail (email.message.Message): The mail to extract series from
+
+    Returns:
+        The matching `SeriesRevision` instance, if any
+    """
+    for ref in find_references(mail, True):
+        # try parsing by RFC5322 fields first
+        try:
+            return SeriesReference.objects.get(msgid=ref).series
+        except SeriesReference.DoesNotExist:
+            pass
+
+
 def find_author(mail):
     from_header = clean_header(mail.get('From'))
     name, email = (None, None)
@@ -226,10 +264,13 @@ def find_headers(mail):
     return '\n'.join(strings)
 
 
-def find_references(mail):
+def find_references(mail, include_msgid=False):
     """Construct a list of possible reply message ids."""
     refs = []
 
+    if include_msgid:
+        refs.append(mail.get('Message-ID'))
+
     if 'In-Reply-To' in mail:
         refs.append(mail.get('In-Reply-To'))
 
@@ -243,6 +284,13 @@ def find_references(mail):
     return refs
 
 
+def _parse_prefixes(subject_prefixes, regex):
+    for prefix in subject_prefixes:
+        m = regex.match(prefix)
+        if m:
+            return m
+
+
 def parse_series_marker(subject_prefixes):
     """Extract series markers from subject.
 
@@ -258,14 +306,36 @@ def parse_series_marker(subject_prefixes):
     """
 
     regex = re.compile('^([0-9]+)/([0-9]+)$')
-    for prefix in subject_prefixes:
-        m = regex.match(prefix)
-        if not m:
-            continue
+    m = _parse_prefixes(subject_prefixes, regex)
+    if m:
         return (int(m.group(1)), int(m.group(2)))
+
     return (None, None)
 
 
+def parse_version(subject, subject_prefixes):
+    """Extract patch version.
+
+    Args:
+        subject: Main body of subject line
+        subject_prefixes: List of subject prefixes to extract version
+          from
+
+    Returns:
+        version if found, else 1
+    """
+    regex = re.compile('^[vV](\d+)$')
+    m = _parse_prefixes(subject_prefixes, regex)
+    if m:
+        return int(m.group(1))
+
+    m = re.search(r'\([vV](\d+)\)', subject)
+    if m:
+        return int(m.group(1))
+
+    return 1
+
+
 def find_content(project, mail):
     """Extract a comment and potential diff from a mail."""
     patchbuf = None
@@ -696,6 +766,7 @@ def parse_mail(mail, list_id=None):
     name, prefixes = clean_subject(subject, [project.linkname])
     is_comment = subject_check(subject)
     x, n = parse_series_marker(prefixes)
+    version = parse_version(name, prefixes)
     refs = find_references(mail)
     date = find_date(mail)
     headers = find_headers(mail)
@@ -712,6 +783,18 @@ def parse_mail(mail, list_id=None):
             filenames = find_filenames(diff)
             delegate = auto_delegate(project, filenames)
 
+        series = find_series(mail)
+        if not series and n:  # the series markers indicates a series
+            series = SeriesRevision(date=date,
+                                    submitter=author,
+                                    version=version,
+                                    total=n)
+            series.save()
+
+            for ref in refs + [msgid]:  # save references for series
+                # we don't want duplicates
+                SeriesReference.objects.get_or_create(series=series, msgid=ref)
+
         patch = Patch(
             msgid=msgid,
             project=project,
@@ -727,6 +810,9 @@ def parse_mail(mail, list_id=None):
         patch.save()
         logger.debug('Patch saved')
 
+        if series:
+            series.add_patch(patch, x)
+
         return patch
     elif x == 0:  # (potential) cover letters
         # if refs are empty, it's implicitly a cover letter. If not,
@@ -749,6 +835,28 @@ def parse_mail(mail, list_id=None):
         if is_cover_letter:
             author.save()
 
+            # we don't use 'find_series' here as a cover letter will
+            # always be the first item in a thread, thus the references
+            # could only point to a different series or unrelated
+            # message
+            try:
+                series = SeriesReference.objects.get(msgid=msgid).series
+            except SeriesReference.DoesNotExist:
+                series = None
+
+            if not series:
+                series = SeriesRevision(date=date,
+                                        submitter=author,
+                                        version=version,
+                                        total=n)
+                series.save()
+
+                # we don't save the in-reply-to or references fields
+                # for a cover letter, as they can't refer to the same
+                # series
+                SeriesReference.objects.get_or_create(series=series,
+                                                      msgid=msgid)
+
             cover_letter = CoverLetter(
                 msgid=msgid,
                 project=project,
@@ -760,6 +868,8 @@ def parse_mail(mail, list_id=None):
             cover_letter.save()
             logger.debug('Cover letter saved')
 
+            series.add_cover_letter(cover_letter)
+
             return cover_letter
 
     # comments
diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
index e16ffbc..96166ad 100644
--- a/patchwork/tests/test_parser.py
+++ b/patchwork/tests/test_parser.py
@@ -35,13 +35,16 @@ from patchwork.parser import clean_subject
 from patchwork.parser import find_author
 from patchwork.parser import find_content
 from patchwork.parser import find_project_by_header
+from patchwork.parser import find_series
 from patchwork.parser import parse_mail as _parse_mail
 from patchwork.parser import parse_pull_request
 from patchwork.parser import parse_series_marker
+from patchwork.parser import parse_version
 from patchwork.parser import split_prefixes
 from patchwork.parser import subject_check
 from patchwork.tests import TEST_MAIL_DIR
 from patchwork.tests.utils import create_project
+from patchwork.tests.utils import create_series_reference
 from patchwork.tests.utils import create_state
 from patchwork.tests.utils import create_user
 from patchwork.tests.utils import read_patch
@@ -328,6 +331,72 @@ class SenderCorrelationTest(TestCase):
         self.assertEqual(person_b.id, person_a.id)
 
 
+class SeriesCorrelationTest(TestCase):
+    """Validate correct behavior of find_series."""
+
+    @staticmethod
+    def _create_email(msgid, references=None):
+        """Create a sample mail.
+
+        Arguments:
+            msgid (str): The message's msgid
+            references (list): A list of preceding messages' msgids,
+                oldest first
+        """
+        mail = 'Message-Id: %s\n' % msgid + \
+               'From: example user <user at example.com>\n' + \
+               'Subject: Tests\n'
+
+        if references:
+            mail += 'In-Reply-To: %s\n' % references[-1]
+            mail += 'References: %s\n' % '\n\t'.join(references)
+
+        mail += 'test\n\n' + SAMPLE_DIFF
+        return message_from_string(mail)
+
+    def test_new_series(self):
+        msgid = make_msgid()
+        email = self._create_email(msgid)
+
+        self.assertIsNone(find_series(email))
+
+    def test_first_reply(self):
+        msgid_a = make_msgid()
+        msgid_b = make_msgid()
+        email = self._create_email(msgid_b, [msgid_a])
+
+        # assume msgid_a was already handled
+        ref = create_series_reference(msgid=msgid_a)
+
+        series = find_series(email)
+        self.assertEqual(series, ref.series)
+
+    def test_nested_series(self):
+        """Handle a series sent in-reply-to an existing series."""
+        # create an old series with a "cover letter"
+        msgids = [make_msgid()]
+        ref_v1 = create_series_reference(msgid=msgids[0])
+
+        # ...and three patches
+        for i in range(3):
+            msgids.append(make_msgid())
+            create_series_reference(msgid=msgids[-1],
+                                    series=ref_v1.series)
+
+        # now create a new series with "cover letter"
+        msgids.append(make_msgid())
+        ref_v2 = create_series_reference(msgid=msgids[-1])
+
+        # ...and the "first patch" of this new series
+        msgid = make_msgid()
+        email = self._create_email(msgid, msgids)
+        series = find_series(email)
+
+        # this should link to the second series - not the first
+        self.assertEqual(len(msgids), 4 + 1)  # old series + new cover
+        self.assertEqual(series, ref_v2.series)
+
+
 class SubjectEncodingTest(TestCase):
     """Validate correct handling of encoded subjects."""
 
@@ -692,24 +761,6 @@ class ParseInitialTagsTest(PatchTest):
             tag__name='Tested-by').count, 1)
 
 
-class PrefixTest(TestCase):
-
-    def test_split_prefixes(self):
-        self.assertEqual(split_prefixes('PATCH'), ['PATCH'])
-        self.assertEqual(split_prefixes('PATCH,RFC'), ['PATCH', 'RFC'])
-        self.assertEqual(split_prefixes(''), [])
-        self.assertEqual(split_prefixes('PATCH,'), ['PATCH'])
-        self.assertEqual(split_prefixes('PATCH '), ['PATCH'])
-        self.assertEqual(split_prefixes('PATCH,RFC'), ['PATCH', 'RFC'])
-        self.assertEqual(split_prefixes('PATCH 1/2'), ['PATCH', '1/2'])
-
-    def test_series_markers(self):
-        self.assertEqual(parse_series_marker([]), (None, None))
-        self.assertEqual(parse_series_marker(['bar']), (None, None))
-        self.assertEqual(parse_series_marker(['bar', '1/2']), (1, 2))
-        self.assertEqual(parse_series_marker(['bar', '0/12']), (0, 12))
-
-
 class SubjectTest(TestCase):
 
     def test_clean_subject(self):
@@ -748,3 +799,28 @@ class SubjectTest(TestCase):
         self.assertIsNotNone(subject_check('RE meep'))
         self.assertIsNotNone(subject_check('Re meep'))
         self.assertIsNotNone(subject_check('re meep'))
+
+    def test_split_prefixes(self):
+        self.assertEqual(split_prefixes('PATCH'), ['PATCH'])
+        self.assertEqual(split_prefixes('PATCH,RFC'), ['PATCH', 'RFC'])
+        self.assertEqual(split_prefixes(''), [])
+        self.assertEqual(split_prefixes('PATCH,'), ['PATCH'])
+        self.assertEqual(split_prefixes('PATCH '), ['PATCH'])
+        self.assertEqual(split_prefixes('PATCH,RFC'), ['PATCH', 'RFC'])
+        self.assertEqual(split_prefixes('PATCH 1/2'), ['PATCH', '1/2'])
+
+    def test_series_markers(self):
+        self.assertEqual(parse_series_marker([]), (None, None))
+        self.assertEqual(parse_series_marker(['bar']), (None, None))
+        self.assertEqual(parse_series_marker(['bar', '1/2']), (1, 2))
+        self.assertEqual(parse_series_marker(['bar', '0/12']), (0, 12))
+
+    def test_version(self):
+        self.assertEqual(parse_version('', []), 1)
+        self.assertEqual(parse_version('Hello, world', []), 1)
+        self.assertEqual(parse_version('Hello, world', ['version']), 1)
+        self.assertEqual(parse_version('Hello, world', ['v2']), 2)
+        self.assertEqual(parse_version('Hello, world', ['V6']), 6)
+        self.assertEqual(parse_version('Hello, world', ['v10']), 10)
+        self.assertEqual(parse_version('Hello, world (v2)', []), 2)
+        self.assertEqual(parse_version('Hello, world (V6)', []), 6)
diff --git a/patchwork/tests/utils.py b/patchwork/tests/utils.py
index 23b0c87..753e1ec 100644
--- a/patchwork/tests/utils.py
+++ b/patchwork/tests/utils.py
@@ -31,6 +31,8 @@ from patchwork.models import CoverLetter
 from patchwork.models import Patch
 from patchwork.models import Person
 from patchwork.models import Project
+from patchwork.models import SeriesReference
+from patchwork.models import SeriesRevision
 from patchwork.models import State
 from patchwork.tests import TEST_PATCH_DIR
 
@@ -217,6 +219,29 @@ def create_check(**kwargs):
     return Check.objects.create(**values)
 
 
+def create_series_revision(**kwargs):
+    """Create 'SeriesRevision' object."""
+    values = {
+        'date': dt.now(),
+        'submitter': create_person() if 'submitter' not in kwargs else None,
+        'total': 1,
+    }
+    values.update(**kwargs)
+
+    return SeriesRevision.objects.create(**values)
+
+
+def create_series_reference(**kwargs):
+    """Create 'SeriesReference' object."""
+    values = {
+        'series': create_series_revision() if 'series' not in kwargs else None,
+        'msgid': make_msgid(),
+    }
+    values.update(**kwargs)
+
+    return SeriesReference.objects.create(**values)
+
+
 def _create_submissions(create_func, count=1, **kwargs):
     """Create 'count' Submission-based objects.
 
-- 
2.7.4



More information about the Patchwork mailing list