Guessing project based on email address

Guilherme Salgado guilherme.salgado at linaro.org
Tue Feb 15 08:43:07 EST 2011


Hi folks,

We're going to use Patchwork to track Linaro patches, but instead of
subscribing to all mailing lists to which patches may be sent, we're
asking all Linaro developers to CC a common email address whenever they
send a patch upstream, and we'll then feed all mail delivered to that
address into Patchwork.

That means most of the messages won't have a List-ID header, but we
should still be able to look up the correct Patchwork project for a given
message based on the email addresses of its recipients.  Hence I'd like to
propose changing find_project() in apps/patchwork/bin/parsemail.py to
fall back to an email address lookup when a project can't be found with
the List-ID in the message.

I think such a change wouldn't cause any harm to regular Patchwork
instances and may even be helpful if a mailing list's List-ID is
changed.  I'm including a patch which does what I've described above,
just to illustrate, but if you guys think this is a sane change, I'll be
happy to send a properly formatted patch (as soon as I figure out how to
make git combine the patch and the cover letter in a single email ;).

Cheers,
Guilherme


diff --git a/apps/patchwork/bin/parsemail.py b/apps/patchwork/bin/parsemail.py
index 700cb6f..305ab95 100755
--- a/apps/patchwork/bin/parsemail.py
+++ b/apps/patchwork/bin/parsemail.py
@@ -55,7 +55,7 @@ def clean_header(header):
 
     return normalise_space(u' '.join(fragments))
 
-def find_project(mail):
+def find_project_by_listid(mail):
     project = None
     listid_res = [re.compile('.*<([^>]+)>.*', re.S),
                   re.compile('^([\S]+)$', re.S)]
@@ -81,6 +81,30 @@ def find_project(mail):
 
     return project
 
+def extract_email_addresses(str):
+    email_re = re.compile(
+        r"([_\.0-9a-zA-Z-+=]+@(([0-9a-zA-Z-]{1,}\.)*)[a-zA-Z]{2,})")
+    # re.findall() will return a list of tuples because we have multiple
+    # groups on the regex above, but we're only interested on the outermost
+    # group (which should contain the whole email address), so we drop the
+    # second and third groups.
+    return [email for email, dummy, dummy2 in email_re.findall(str)]
+
+def find_project_by_list_address(mail):
+    recipients = mail.get('To', '') + mail.get('CC', '')
+    for email_address in extract_email_addresses(recipients):
+        try:
+            return Project.objects.get(listemail = email_address)
+        except Project.DoesNotExist:
+            pass
+    return None
+
+def find_project(mail):
+    project = find_project_by_listid(mail)
+    if project is None:
+        project = find_project_by_list_address(mail)
+    return project
+
 def find_author(mail):
 
     from_header = clean_header(mail.get('From'))
diff --git a/apps/patchwork/tests/patchparser.py b/apps/patchwork/tests/patchparser.py
index ff0025a..d4d15d1 100644
--- a/apps/patchwork/tests/patchparser.py
+++ b/apps/patchwork/tests/patchparser.py
@@ -34,8 +34,9 @@ class PatchTest(unittest.TestCase):
     default_subject = defaults.subject
     project = defaults.project
 
-from patchwork.bin.parsemail import find_content, find_author, find_project, \
-                                    parse_mail
+from patchwork.bin.parsemail import (
+    extract_email_addresses, find_content, find_author, find_project,
+    parse_mail)
 
 class InlinePatchTest(PatchTest):
     patch_filename = '0001-add-line.patch'
@@ -276,18 +277,44 @@ class MultipleProjectPatchCommentTest(MultipleProjectPatchTest):
             # and the one we parsed in setUp()
             self.assertEquals(Comment.objects.filter(patch = patch).count(), 2)
 
-class ListIdHeaderTest(unittest.TestCase):
-    """ Test that we parse List-Id headers from mails correctly """
+class EmailProjectGuessing(unittest.TestCase):
+    """Projects are guessed based on List-Id headers or recipient addresses"""
     def setUp(self):
         self.project = Project(linkname = 'test-project-1', name = 'Project 1',
                 listid = '1.example.com', listemail='1 at example.com')
         self.project.save()
 
+    def testExtractingEmailAddressesFromRecipientsList(self):
+        emails = extract_email_addresses(
+            '"Foo Bar" <foo.bar at example.com>,'
+            '<baz+list at foo.example.com>,'
+            'bar-foo at bar.foo.example.com,'
+            # Notice that this one is not a valid email address.
+            'bar-foo at .com')
+        self.assertEqual(
+            ['foo.bar at example.com',
+             'baz+list at foo.example.com',
+             'bar-foo at bar.foo.example.com'],
+            emails)
+
     def testNoListId(self):
         email = MIMEText('')
         project = find_project(email)
         self.assertEquals(project, None)
 
+    def testNoListIdWithListEmailAsRecipient(self):
+        email = MIMEText('')
+        email['To'] = '"First dev list" <1 at example.com>'
+        project = find_project(email)
+        self.assertEquals(self.project, project)
+
+    def testNoListIdWithListEmailAsCC(self):
+        email = MIMEText('')
+        email['CC'] = ('"First maintainer <maintainer at example.com>, '
+                       '"First dev list" <1 at example.com>')
+        project = find_project(email)
+        self.assertEquals(self.project, project)
+
     def testBlankListId(self):
         email = MIMEText('')
         email['List-Id'] = ''


-- 
Guilherme Salgado <https://launchpad.net/~salgado>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 198 bytes
Desc: This is a digitally signed message part
URL: <http://lists.ozlabs.org/pipermail/patchwork/attachments/20110214/9530396b/attachment.pgp>


More information about the Patchwork mailing list