[PATCH] parser: Unmangle From: headers that have been mangled for DMARC purposes

Jonathan Nieder jrnieder at gmail.com
Fri Oct 11 06:41:32 AEDT 2019


Hi,

Andrew Donnellan wrote:

> To avoid triggering spam filters due to failed signature validation, many
> mailing lists mangle the From header to change the From address to be the
> address of the list, typically where the sender's domain has a strict DMARC
> policy enabled.
>
> In this case, we should try to unmangle the From header.
>
> Add support for using the X-Original-Sender or Reply-To headers, as used by
> Google Groups and Mailman respectively, to unmangle the From header when
> necessary.
>
> Closes: #64 ("Incorrect submitter when using googlegroups")
> Reported-by: Alexandre Belloni <alexandre.belloni at bootlin.com>
> Reported-by: Stephen Rothwell <sfr at canb.auug.org.au>
> Signed-off-by: Andrew Donnellan <ajd at linux.ibm.com>
> ---
>  patchwork/parser.py            | 75 ++++++++++++++++++++++++++++++----
>  patchwork/tests/test_parser.py | 68 ++++++++++++++++++++++++++++--
>  2 files changed, 130 insertions(+), 13 deletions(-)

Interesting!  I'm cc-ing the Git mailing list in case "git am" might
wnat to learn the same support.

Thanks,
Jonathan

(patch left unsnipped for reference)
> diff --git a/patchwork/parser.py b/patchwork/parser.py
> index 7dc66bc05a5b..79beac5617b1 100644
> --- a/patchwork/parser.py
> +++ b/patchwork/parser.py
> @@ -321,12 +321,7 @@ def find_series(project, mail, author):
>      return _find_series_by_markers(project, mail, author)
>  
>  
> -def get_or_create_author(mail):
> -    from_header = clean_header(mail.get('From'))
> -
> -    if not from_header:
> -        raise ValueError("Invalid 'From' header")
> -
> +def split_from_header(from_header):
>      name, email = (None, None)
>  
>      # tuple of (regex, fn)
> @@ -355,6 +350,65 @@ def get_or_create_author(mail):
>              (name, email) = fn(match.groups())
>              break
>  
> +    return (name, email)
> +
> +
> +# Unmangle From addresses that have been mangled for DMARC purposes.
> +#
> +# To avoid triggering spam filters due to failed signature validation, many
> +# mailing lists mangle the From header to change the From address to be the
> +# address of the list, typically where the sender's domain has a strict
> +# DMARC policy enabled.
> +#
> +# Unfortunately, there's no standardised way of preserving the original
> +# From address.
> +#
> +# Google Groups adds an X-Original-Sender header. If present, we use that.
> +#
> +# Mailman preserves the original address by adding a Reply-To, except in the
> +# case where the list is set to either reply to list, or reply to a specific
> +# address, in which case the original From is added to Cc instead. These corner
> +# cases are dumb, but we try and handle things as sensibly as possible by
> +# looking for a name in Reply-To/Cc that matches From. It's inexact but should
> +# be good enough for our purposes.
> +def get_original_sender(mail, name, email):
> +    if name and ' via ' in name:
> +        # Mailman uses the format "<name> via <list>"
> +        # Google Groups uses "'<name>' via <list>"
> +        stripped_name = name[:name.rfind(' via ')].strip().strip("'")
> +
> +    original_sender = clean_header(mail.get('X-Original-Sender', ''))
> +    if original_sender:
> +        new_email = split_from_header(original_sender)[1].strip()[:255]
> +        return (stripped_name, new_email)
> +
> +    addrs = []
> +    reply_to_headers = mail.get_all('Reply-To') or []
> +    cc_headers = mail.get_all('Cc') or []
> +    for header in reply_to_headers + cc_headers:
> +        header = clean_header(header)
> +        addrs = header.split(",")
> +        for addr in addrs:
> +            new_name, new_email = split_from_header(addr)
> +            if new_name:
> +                new_name = new_name.strip()[:255]
> +            if new_email:
> +                new_email = new_email.strip()[:255]
> +            if new_name == stripped_name:
> +                return (stripped_name, new_email)
> +
> +    # If we can't figure out the original sender, just keep it as is
> +    return (name, email)
> +
> +
> +def get_or_create_author(mail, project=None):
> +    from_header = clean_header(mail.get('From'))
> +
> +    if not from_header:
> +        raise ValueError("Invalid 'From' header")
> +
> +    name, email = split_from_header(from_header)
> +
>      if not email:
>          raise ValueError("Invalid 'From' header")
>  
> @@ -362,6 +416,9 @@ def get_or_create_author(mail):
>      if name is not None:
>          name = name.strip()[:255]
>  
> +    if project and email.lower() == project.listemail.lower():
> +        name, email = get_original_sender(mail, name, email)
> +
>      # this correctly handles the case where we lose the race to create
>      # the person and another process beats us to it. (If the record
>      # does not exist, g_o_c invokes _create_object_from_params which
> @@ -1004,7 +1061,7 @@ def parse_mail(mail, list_id=None):
>  
>      if not is_comment and (diff or pull_url):  # patches or pull requests
>          # we delay the saving until we know we have a patch.
> -        author = get_or_create_author(mail)
> +        author = get_or_create_author(mail, project)
>  
>          delegate = find_delegate_by_header(mail)
>          if not delegate and diff:
> @@ -1099,7 +1156,7 @@ def parse_mail(mail, list_id=None):
>                  is_cover_letter = True
>  
>          if is_cover_letter:
> -            author = get_or_create_author(mail)
> +            author = get_or_create_author(mail, project)
>  
>              # we don't use 'find_series' here as a cover letter will
>              # always be the first item in a thread, thus the references
> @@ -1159,7 +1216,7 @@ def parse_mail(mail, list_id=None):
>      if not submission:
>          return
>  
> -    author = get_or_create_author(mail)
> +    author = get_or_create_author(mail, project)
>  
>      try:
>          comment = Comment.objects.create(
> diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
> index e5a2fca3044e..85c6e7550f93 100644
> --- a/patchwork/tests/test_parser.py
> +++ b/patchwork/tests/test_parser.py
> @@ -265,11 +265,23 @@ class SenderCorrelationTest(TestCase):
>      """
>  
>      @staticmethod
> -    def _create_email(from_header):
> +    def _create_email(from_header, reply_tos=None, ccs=None,
> +                      x_original_sender=None):
>          mail = 'Message-Id: %s\n' % make_msgid() + \
> -               'From: %s\n' % from_header + \
> -               'Subject: Tests\n\n'\
> -               'test\n'
> +               'From: %s\n' % from_header
> +
> +        if reply_tos:
> +            mail += 'Reply-To: %s\n' % ', '.join(reply_tos)
> +
> +        if ccs:
> +            mail += 'Cc: %s\n' % ', '.join(ccs)
> +
> +        if x_original_sender:
> +            mail += 'X-Original-Sender: %s\n' % x_original_sender
> +
> +        mail += 'Subject: Tests\n\n'\
> +            'test\n'
> +
>          return message_from_string(mail)
>  
>      def test_existing_sender(self):
> @@ -311,6 +323,54 @@ class SenderCorrelationTest(TestCase):
>          self.assertEqual(person_b._state.adding, False)
>          self.assertEqual(person_b.id, person_a.id)
>  
> +    def test_mailman_dmarc_munging(self):
> +        project = create_project()
> +        real_sender = 'Existing Sender <existing at example.com>'
> +        munged_sender = 'Existing Sender via List <{}>'.format(
> +            project.listemail)
> +        other_email = 'Other Person <other at example.com>'
> +
> +        # Unmunged author
> +        mail = self._create_email(real_sender)
> +        person_a = get_or_create_author(mail, project)
> +        person_a.save()
> +
> +        # Single Reply-To
> +        mail = self._create_email(munged_sender, [real_sender])
> +        person_b = get_or_create_author(mail, project)
> +        self.assertEqual(person_b._state.adding, False)
> +        self.assertEqual(person_b.id, person_a.id)
> +
> +        # Single Cc
> +        mail = self._create_email(munged_sender, [], [real_sender])
> +        person_b = get_or_create_author(mail, project)
> +        self.assertEqual(person_b._state.adding, False)
> +        self.assertEqual(person_b.id, person_a.id)
> +
> +        # Multiple Reply-Tos and Ccs
> +        mail = self._create_email(munged_sender, [other_email, real_sender],
> +                                  [other_email, other_email])
> +        person_b = get_or_create_author(mail, project)
> +        self.assertEqual(person_b._state.adding, False)
> +        self.assertEqual(person_b.id, person_a.id)
> +
> +    def test_google_dmarc_munging(self):
> +        project = create_project()
> +        real_sender = 'Existing Sender <existing at example.com>'
> +        munged_sender = "'Existing Sender' via List <{}>".format(
> +            project.listemail)
> +
> +        # Unmunged author
> +        mail = self._create_email(real_sender)
> +        person_a = get_or_create_author(mail, project)
> +        person_a.save()
> +
> +        # X-Original-Sender header
> +        mail = self._create_email(munged_sender, None, None, real_sender)
> +        person_b = get_or_create_author(mail, project)
> +        self.assertEqual(person_b._state.adding, False)
> +        self.assertEqual(person_b.id, person_a.id)
> +
>  
>  class SeriesCorrelationTest(TestCase):
>      """Validate correct behavior of find_series."""


More information about the Patchwork mailing list