[PATCH 04/11] parsemail: Move parsing to 'parser'
Stephen Finucane
stephen.finucane at intel.com
Wed Jul 13 19:40:53 AEST 2016
Separate the parsing of mails from the CLI aspects of parsemail. Since
there is already a 'parser' module, it makes sense to place the parsing
functionality here. This will allow for additional uses of this parsing
functionality in the future.
Signed-off-by: Stephen Finucane <stephen.finucane at intel.com>
---
patchwork/bin/parsearchive.py | 4 +-
patchwork/bin/parsemail.py | 523 +---------------------------------------
patchwork/parser.py | 526 +++++++++++++++++++++++++++++++++++++++-
patchwork/tests/test_parser.py | 17 +-
4 files changed, 536 insertions(+), 534 deletions(-)
diff --git a/patchwork/bin/parsearchive.py b/patchwork/bin/parsearchive.py
index 30bca13..8986b22 100755
--- a/patchwork/bin/parsearchive.py
+++ b/patchwork/bin/parsearchive.py
@@ -29,7 +29,7 @@ import mailbox
import django
-from patchwork.bin import parsemail
+from patchwork.parser import parse_mail
from patchwork import models
LOGGER = logging.getLogger(__name__)
@@ -55,7 +55,7 @@ def parse_mbox(path, list_id):
mbox = mailbox.mbox(path)
for msg in mbox:
try:
- obj = parsemail.parse_mail(msg, list_id)
+ obj = parse_mail(msg, list_id)
if obj:
results[type(obj)] += 1
else:
diff --git a/patchwork/bin/parsemail.py b/patchwork/bin/parsemail.py
index 56cd126..abcee04 100755
--- a/patchwork/bin/parsemail.py
+++ b/patchwork/bin/parsemail.py
@@ -22,29 +22,15 @@
from __future__ import absolute_import
import argparse
-import codecs
-import datetime
from email import message_from_file
-from email.header import Header, decode_header
-from email.utils import parsedate_tz, mktime_tz
-from fnmatch import fnmatch
-from functools import reduce
import logging
-import operator
-import re
import sys
import django
from django.conf import settings
-from django.contrib.auth.models import User
from django.utils.log import AdminEmailHandler
-from django.utils import six
-from django.utils.six.moves import map
-from patchwork.models import (Patch, Project, Person, Comment, State,
- DelegationRule, Submission, CoverLetter,
- get_default_initial_patch_state)
-from patchwork.parser import parse_patch, find_filenames
+from patchwork.parser import parse_mail
LOGGER = logging.getLogger(__name__)
@@ -56,513 +42,6 @@ VERBOSITY_LEVELS = {
'critical': logging.CRITICAL
}
-list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
-
-
-def normalise_space(str):
- whitespace_re = re.compile(r'\s+')
- return whitespace_re.sub(' ', str).strip()
-
-
-def clean_header(header):
- """Decode (possibly non-ascii) headers."""
- def decode(fragment):
- (frag_str, frag_encoding) = fragment
- if frag_encoding:
- return frag_str.decode(frag_encoding)
- elif isinstance(frag_str, six.binary_type): # python 2
- return frag_str.decode()
- return frag_str
-
- fragments = list(map(decode, decode_header(header)))
-
- return normalise_space(u' '.join(fragments))
-
-
-def find_project_by_id(list_id):
- """Find a `project` object with given `list_id`."""
- project = None
- try:
- project = Project.objects.get(listid=list_id)
- except Project.DoesNotExist:
- pass
- return project
-
-
-def find_project_by_header(mail):
- project = None
- listid_res = [re.compile(r'.*<([^>]+)>.*', re.S),
- re.compile(r'^([\S]+)$', re.S)]
-
- for header in list_id_headers:
- if header in mail:
-
- for listid_re in listid_res:
- match = listid_re.match(mail.get(header))
- if match:
- break
-
- if not match:
- continue
-
- listid = match.group(1)
-
- project = find_project_by_id(listid)
- if project:
- break
-
- return project
-
-
-def find_author(mail):
-
- from_header = clean_header(mail.get('From'))
- name, email = (None, None)
-
- # tuple of (regex, fn)
- # - where fn returns a (name, email) tuple from the match groups resulting
- # from re.match().groups()
- from_res = [
- # for "Firstname Lastname" <example@example.com> style addresses
- (re.compile(r'"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))),
-
- # for example@example.com (Firstname Lastname) style addresses
- (re.compile(r'"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))),
-
- # for example at example.com (Firstname Lastname) style addresses
- (re.compile(r'(.*?)\sat\s(.*?)\s*\(([^\)]+)\)'),
- (lambda g: (g[2], '@'.join(g[0:2])))),
-
- # everything else
- (re.compile(r'(.*)'), (lambda g: (None, g[0]))),
- ]
-
- for regex, fn in from_res:
- match = regex.match(from_header)
- if match:
- (name, email) = fn(match.groups())
- break
-
- if email is None:
- raise ValueError("Invalid 'From' header")
-
- email = email.strip()
- if name is not None:
- name = name.strip()
-
- try:
- person = Person.objects.get(email__iexact=email)
- if name: # use the latest provided name
- person.name = name
- except Person.DoesNotExist:
- person = Person(name=name, email=email)
-
- return person
-
-
-def find_date(mail):
- t = parsedate_tz(mail.get('Date', ''))
- if not t:
- return datetime.datetime.utcnow()
- return datetime.datetime.utcfromtimestamp(mktime_tz(t))
-
-
-def find_headers(mail):
- return reduce(operator.__concat__,
- ['%s: %s\n' % (k, Header(v, header_name=k,
- continuation_ws='\t').encode())
- for (k, v) in list(mail.items())])
-
-
-def find_pull_request(content):
- git_re = re.compile(r'^The following changes since commit.*' +
- r'^are available in the git repository at:\n'
- r'^\s*([\S]+://[^\n]+)$',
- re.DOTALL | re.MULTILINE)
- match = git_re.search(content)
- if match:
- return match.group(1)
- return None
-
-
-def find_references(mail):
- """Construct a list of possible reply message ids."""
- refs = []
-
- if 'In-Reply-To' in mail:
- refs.append(mail.get('In-Reply-To'))
-
- if 'References' in mail:
- rs = mail.get('References').split()
- rs.reverse()
- for r in rs:
- if r not in refs:
- refs.append(r)
-
- return refs
-
-
-def parse_series_marker(subject_prefixes):
- """Extract series markers from subject.
-
- Extract the markers of multi-patches series, i.e. 'x/n', from the
- provided subject series.
-
- Args:
- subject_prefixes: List of subject prefixes to extract markers
- from
-
- Returns:
- (x, n) if markers found, else (None, None)
- """
-
- regex = re.compile('^([0-9]+)/([0-9]+)$')
- for prefix in subject_prefixes:
- m = regex.match(prefix)
- if not m:
- continue
- return (int(m.group(1)), int(m.group(2)))
- return (None, None)
-
-
-def find_content(project, mail):
- patchbuf = None
- commentbuf = ''
-
- for part in mail.walk():
- if part.get_content_maintype() != 'text':
- continue
-
- payload = part.get_payload(decode=True)
- subtype = part.get_content_subtype()
-
- if not isinstance(payload, six.text_type):
- charset = part.get_content_charset()
-
- # Check that we have a charset that we understand. Otherwise,
- # ignore it and fallback to our standard set.
- if charset is not None:
- try:
- codecs.lookup(charset)
- except LookupError:
- charset = None
-
- # If there is no charset or if it is unknown, then try some common
- # charsets before we fail.
- if charset is None:
- try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
- else:
- try_charsets = [charset]
-
- for cset in try_charsets:
- try:
- payload = six.text_type(payload, cset)
- break
- except UnicodeDecodeError:
- payload = None
-
- # Could not find a valid decoded payload. Fail.
- if payload is None:
- return None, None
-
- if subtype in ['x-patch', 'x-diff']:
- patchbuf = payload
- elif subtype == 'plain':
- c = payload
-
- if not patchbuf:
- patchbuf, c = parse_patch(payload)
-
- if c is not None:
- commentbuf += c.strip() + '\n'
-
- commentbuf = clean_content(commentbuf)
-
- return patchbuf, commentbuf
-
-
-def find_submission_for_comment(project, refs):
- for ref in refs:
- # first, check for a direct reply
- try:
- submission = Submission.objects.get(project=project, msgid=ref)
- return submission
- except Submission.DoesNotExist:
- pass
-
- # see if we have comments that refer to a patch
- try:
- comment = Comment.objects.get(submission__project=project,
- msgid=ref)
- return comment.submission
- except Comment.MultipleObjectsReturned:
- # NOTE(stephenfin): This is a artifact of prior lack of support
- # for cover letters in Patchwork. Previously all replies to
- # patches were saved as comments. However, it's possible that
- # someone could have created a new series as a reply to one of the
- # comments on the original patch series. For example,
- # '2015-November/002096.html' from the Patchwork archives. In this
- # case, reparsing the archives will result in creation of a cover
- # letter with the same message ID as the existing comment. Follow
- # up comments will then apply to both this cover letter and the
- # linked patch from the comment previously created. We choose to
- # apply the comment to the cover letter. Note that this only
- # happens when running 'parsearchive' or similar, so it should not
- # affect every day use in any way.
- comments = Comment.objects.filter(submission__project=project,
- msgid=ref)
- # The latter item will be the cover letter
- return comments.reverse()[0].submission
- except Comment.DoesNotExist:
- pass
-
- return None
-
-
-def split_prefixes(prefix):
- """Turn a prefix string into a list of prefix tokens."""
- split_re = re.compile(r'[,\s]+')
- matches = split_re.split(prefix)
-
- return [s for s in matches if s != '']
-
-
-def clean_subject(subject, drop_prefixes=None):
- """Clean a Subject: header from an incoming patch.
-
- Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By
- default, only [PATCH] is removed, and we keep any other bracketed
- data in the subject. If drop_prefixes is provided, remove those
- too, comparing case-insensitively.
-
- Args:
- subject: Subject to be cleaned
- drop_prefixes: Additional, case-insensitive prefixes to remove
- from the subject
- """
- re_re = re.compile(r'^(re|fwd?)[:\s]\s*', re.I)
- prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$')
- subject = clean_header(subject)
-
- if drop_prefixes is None:
- drop_prefixes = []
- else:
- drop_prefixes = [s.lower() for s in drop_prefixes]
-
- drop_prefixes.append('patch')
-
- # remove Re:, Fwd:, etc
- subject = re_re.sub(' ', subject)
-
- subject = normalise_space(subject)
-
- prefixes = []
-
- match = prefix_re.match(subject)
-
- while match:
- prefix_str = match.group(1)
- prefixes += [p for p in split_prefixes(prefix_str)
- if p.lower() not in drop_prefixes]
-
- subject = match.group(2)
- match = prefix_re.match(subject)
-
- subject = normalise_space(subject)
-
- subject = subject.strip()
- if prefixes:
- subject = '[%s] %s' % (','.join(prefixes), subject)
-
- return (subject, prefixes)
-
-
-def clean_content(content):
- """Remove cruft from the email message.
-
- Catch signature (-- ) and list footer (_____) cruft.
- """
- sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M)
- content = sig_re.sub('', content)
-
- return content.strip()
-
-
-def find_state(mail):
- """Return the state with the given name or the default."""
- state_name = mail.get('X-Patchwork-State', '').strip()
- if state_name:
- try:
- return State.objects.get(name__iexact=state_name)
- except State.DoesNotExist:
- pass
- return get_default_initial_patch_state()
-
-
-def auto_delegate(project, filenames):
- if not filenames:
- return None
-
- rules = list(DelegationRule.objects.filter(project=project))
-
- patch_delegate = None
-
- for filename in filenames:
- file_delegate = None
- for rule in rules:
- if fnmatch(filename, rule.path):
- file_delegate = rule.user
- break
-
- if file_delegate is None:
- return None
-
- if patch_delegate is not None and file_delegate != patch_delegate:
- return None
-
- patch_delegate = file_delegate
-
- return patch_delegate
-
-
-def find_delegate(mail):
- """Return the delegate with the given email or None."""
- delegate_email = mail.get('X-Patchwork-Delegate', '').strip()
- if delegate_email:
- try:
- return User.objects.get(email__iexact=delegate_email)
- except User.DoesNotExist:
- pass
- return None
-
-
-def parse_mail(mail, list_id=None):
- """Parse a mail and add to the database.
-
- Args:
- mail (`mbox.Mail`): Mail to parse and add.
- list_id (str): Mailing list ID
-
- Returns:
- None
- """
- # some basic sanity checks
- if 'From' not in mail:
- raise ValueError("Missing 'From' header")
-
- if 'Subject' not in mail:
- raise ValueError("Missing 'Subject' header")
-
- if 'Message-Id' not in mail:
- raise ValueError("Missing 'Message-Id' header")
-
- hint = mail.get('X-Patchwork-Hint', '').lower()
- if hint == 'ignore':
- LOGGER.debug("Ignoring email due to 'ignore' hint")
- return
-
- if list_id:
- project = find_project_by_id(list_id)
- else:
- project = find_project_by_header(mail)
-
- if project is None:
- LOGGER.error('Failed to find a project for email')
- return
-
- # parse content
-
- diff, message = find_content(project, mail)
-
- if not (diff or message):
- return # nothing to work with
-
- msgid = mail.get('Message-Id').strip()
- author = find_author(mail)
- name, prefixes = clean_subject(mail.get('Subject'), [project.linkname])
- x, n = parse_series_marker(prefixes)
- refs = find_references(mail)
- date = find_date(mail)
- headers = find_headers(mail)
- pull_url = find_pull_request(message)
-
- # build objects
-
- if diff or pull_url: # patches or pull requests
- # we delay the saving until we know we have a patch.
- author.save()
-
- delegate = find_delegate(mail)
- if not delegate and diff:
- filenames = find_filenames(diff)
- delegate = auto_delegate(project, filenames)
-
- patch = Patch(
- msgid=msgid,
- project=project,
- name=name,
- date=date,
- headers=headers,
- submitter=author,
- content=message,
- diff=diff,
- pull_url=pull_url,
- delegate=delegate,
- state=find_state(mail))
- patch.save()
- LOGGER.debug('Patch saved')
-
- return patch
- elif x == 0: # (potential) cover letters
- # if refs are empty, it's implicitly a cover letter. If not,
- # however, we need to see if a match already exists and, if
- # not, assume that it is indeed a new cover letter
- is_cover_letter = False
- if not refs == []:
- try:
- CoverLetter.objects.all().get(name=name)
- except CoverLetter.DoesNotExist: # no match => new cover
- is_cover_letter = True
- else:
- is_cover_letter = True
-
- if is_cover_letter:
- author.save()
-
- cover_letter = CoverLetter(
- msgid=msgid,
- project=project,
- name=name,
- date=date,
- headers=headers,
- submitter=author,
- content=message)
- cover_letter.save()
- LOGGER.debug('Cover letter saved')
-
- return cover_letter
-
- # comments
-
- # we only save comments if we have the parent email
- submission = find_submission_for_comment(project, refs)
- if not submission:
- return
-
- author.save()
-
- comment = Comment(
- submission=submission,
- msgid=msgid,
- date=date,
- headers=headers,
- submitter=author,
- content=message)
- comment.save()
- LOGGER.debug('Comment saved')
-
- return comment
-
extra_error_message = '''
== Mail
diff --git a/patchwork/parser.py b/patchwork/parser.py
index c9c058d..938b965 100644
--- a/patchwork/parser.py
+++ b/patchwork/parser.py
@@ -19,13 +19,359 @@
# along with Patchwork; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+import codecs
+import datetime
+from email.header import Header, decode_header
+from email.utils import parsedate_tz, mktime_tz
+from fnmatch import fnmatch
+from functools import reduce
+import logging
+import operator
import re
+from django.contrib.auth.models import User
+from django.utils import six
from django.utils.six.moves import map
+from patchwork.models import (Patch, Project, Person, Comment, State,
+ DelegationRule, Submission, CoverLetter,
+ get_default_initial_patch_state)
-_hunk_re = re.compile('^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
-_filename_re = re.compile('^(---|\+\+\+) (\S+)')
+
+_hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
+_filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
+list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
+
+LOGGER = logging.getLogger(__name__)
+
+
+def normalise_space(str):
+ whitespace_re = re.compile(r'\s+')
+ return whitespace_re.sub(' ', str).strip()
+
+
+def clean_header(header):
+ """Decode (possibly non-ascii) headers."""
+ def decode(fragment):
+ (frag_str, frag_encoding) = fragment
+ if frag_encoding:
+ return frag_str.decode(frag_encoding)
+ elif isinstance(frag_str, six.binary_type): # python 2
+ return frag_str.decode()
+ return frag_str
+
+ fragments = list(map(decode, decode_header(header)))
+
+ return normalise_space(u' '.join(fragments))
+
+
+def find_project_by_id(list_id):
+ """Find a `project` object with given `list_id`."""
+ project = None
+ try:
+ project = Project.objects.get(listid=list_id)
+ except Project.DoesNotExist:
+ pass
+ return project
+
+
+def find_project_by_header(mail):
+ project = None
+ listid_res = [re.compile(r'.*<([^>]+)>.*', re.S),
+ re.compile(r'^([\S]+)$', re.S)]
+
+ for header in list_id_headers:
+ if header in mail:
+
+ for listid_re in listid_res:
+ match = listid_re.match(mail.get(header))
+ if match:
+ break
+
+ if not match:
+ continue
+
+ listid = match.group(1)
+
+ project = find_project_by_id(listid)
+ if project:
+ break
+
+ return project
+
+
+def find_author(mail):
+ from_header = clean_header(mail.get('From'))
+ name, email = (None, None)
+
+ # tuple of (regex, fn)
+ # - where fn returns a (name, email) tuple from the match groups resulting
+ # from re.match().groups()
+ from_res = [
+ # for "Firstname Lastname" <example@example.com> style addresses
+ (re.compile(r'"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))),
+
+ # for example@example.com (Firstname Lastname) style addresses
+ (re.compile(r'"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))),
+
+ # for example at example.com (Firstname Lastname) style addresses
+ (re.compile(r'(.*?)\sat\s(.*?)\s*\(([^\)]+)\)'),
+ (lambda g: (g[2], '@'.join(g[0:2])))),
+
+ # everything else
+ (re.compile(r'(.*)'), (lambda g: (None, g[0]))),
+ ]
+
+ for regex, fn in from_res:
+ match = regex.match(from_header)
+ if match:
+ (name, email) = fn(match.groups())
+ break
+
+ if email is None:
+ raise ValueError("Invalid 'From' header")
+
+ email = email.strip()
+ if name is not None:
+ name = name.strip()
+
+ try:
+ person = Person.objects.get(email__iexact=email)
+ if name: # use the latest provided name
+ person.name = name
+ except Person.DoesNotExist:
+ person = Person(name=name, email=email)
+
+ return person
+
+
+def find_date(mail):
+ t = parsedate_tz(mail.get('Date', ''))
+ if not t:
+ return datetime.datetime.utcnow()
+ return datetime.datetime.utcfromtimestamp(mktime_tz(t))
+
+
+def find_headers(mail):
+ return reduce(operator.__concat__,
+ ['%s: %s\n' % (k, Header(v, header_name=k,
+ continuation_ws='\t').encode())
+ for (k, v) in list(mail.items())])
+
+
+def find_pull_request(content):
+ git_re = re.compile(r'^The following changes since commit.*' +
+ r'^are available in the git repository at:\n'
+ r'^\s*([\S]+://[^\n]+)$',
+ re.DOTALL | re.MULTILINE)
+ match = git_re.search(content)
+ if match:
+ return match.group(1)
+ return None
+
+
+def find_references(mail):
+ """Construct a list of possible reply message ids."""
+ refs = []
+
+ if 'In-Reply-To' in mail:
+ refs.append(mail.get('In-Reply-To'))
+
+ if 'References' in mail:
+ rs = mail.get('References').split()
+ rs.reverse()
+ for r in rs:
+ if r not in refs:
+ refs.append(r)
+
+ return refs
+
+
+def parse_series_marker(subject_prefixes):
+ """Extract series markers from subject.
+
+ Extract the markers of multi-patches series, i.e. 'x/n', from the
+ provided subject series.
+
+ Args:
+ subject_prefixes: List of subject prefixes to extract markers
+ from
+
+ Returns:
+ (x, n) if markers found, else (None, None)
+ """
+
+ regex = re.compile('^([0-9]+)/([0-9]+)$')
+ for prefix in subject_prefixes:
+ m = regex.match(prefix)
+ if not m:
+ continue
+ return (int(m.group(1)), int(m.group(2)))
+ return (None, None)
+
+
+def find_content(project, mail):
+ """Extract a comment and potential diff from a mail."""
+ patchbuf = None
+ commentbuf = ''
+
+ for part in mail.walk():
+ if part.get_content_maintype() != 'text':
+ continue
+
+ payload = part.get_payload(decode=True)
+ subtype = part.get_content_subtype()
+
+ if not isinstance(payload, six.text_type):
+ charset = part.get_content_charset()
+
+ # Check that we have a charset that we understand. Otherwise,
+ # ignore it and fallback to our standard set.
+ if charset is not None:
+ try:
+ codecs.lookup(charset)
+ except LookupError:
+ charset = None
+
+ # If there is no charset or if it is unknown, then try some common
+ # charsets before we fail.
+ if charset is None:
+ try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1']
+ else:
+ try_charsets = [charset]
+
+ for cset in try_charsets:
+ try:
+ payload = six.text_type(payload, cset)
+ break
+ except UnicodeDecodeError:
+ payload = None
+
+ # Could not find a valid decoded payload. Fail.
+ if payload is None:
+ return None, None
+
+ if subtype in ['x-patch', 'x-diff']:
+ patchbuf = payload
+ elif subtype == 'plain':
+ c = payload
+
+ if not patchbuf:
+ patchbuf, c = parse_patch(payload)
+
+ if c is not None:
+ commentbuf += c.strip() + '\n'
+
+ commentbuf = clean_content(commentbuf)
+
+ return patchbuf, commentbuf
+
+
+def find_submission_for_comment(project, refs):
+ for ref in refs:
+ # first, check for a direct reply
+ try:
+ submission = Submission.objects.get(project=project, msgid=ref)
+ return submission
+ except Submission.DoesNotExist:
+ pass
+
+ # see if we have comments that refer to a patch
+ try:
+ comment = Comment.objects.get(submission__project=project,
+ msgid=ref)
+ return comment.submission
+ except Comment.MultipleObjectsReturned:
+ # NOTE(stephenfin): This is a artifact of prior lack of support
+ # for cover letters in Patchwork. Previously all replies to
+ # patches were saved as comments. However, it's possible that
+ # someone could have created a new series as a reply to one of the
+ # comments on the original patch series. For example,
+ # '2015-November/002096.html' from the Patchwork archives. In this
+ # case, reparsing the archives will result in creation of a cover
+ # letter with the same message ID as the existing comment. Follow
+ # up comments will then apply to both this cover letter and the
+ # linked patch from the comment previously created. We choose to
+ # apply the comment to the cover letter. Note that this only
+ # happens when running 'parsearchive' or similar, so it should not
+ # affect every day use in any way.
+ comments = Comment.objects.filter(submission__project=project,
+ msgid=ref)
+ # The latter item will be the cover letter
+ return comments.reverse()[0].submission
+ except Comment.DoesNotExist:
+ pass
+
+ return None
+
+
+def split_prefixes(prefix):
+ """Turn a prefix string into a list of prefix tokens."""
+ split_re = re.compile(r'[,\s]+')
+ matches = split_re.split(prefix)
+
+ return [s for s in matches if s != '']
+
+
+def clean_subject(subject, drop_prefixes=None):
+ """Clean a Subject: header from an incoming patch.
+
+ Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By
+ default, only [PATCH] is removed, and we keep any other bracketed
+ data in the subject. If drop_prefixes is provided, remove those
+ too, comparing case-insensitively.
+
+ Args:
+ subject: Subject to be cleaned
+ drop_prefixes: Additional, case-insensitive prefixes to remove
+ from the subject
+ """
+ re_re = re.compile(r'^(re|fwd?)[:\s]\s*', re.I)
+ prefix_re = re.compile(r'^\[([^\]]*)\]\s*(.*)$')
+ subject = clean_header(subject)
+
+ if drop_prefixes is None:
+ drop_prefixes = []
+ else:
+ drop_prefixes = [s.lower() for s in drop_prefixes]
+
+ drop_prefixes.append('patch')
+
+ # remove Re:, Fwd:, etc
+ subject = re_re.sub(' ', subject)
+
+ subject = normalise_space(subject)
+
+ prefixes = []
+
+ match = prefix_re.match(subject)
+
+ while match:
+ prefix_str = match.group(1)
+ prefixes += [p for p in split_prefixes(prefix_str)
+ if p.lower() not in drop_prefixes]
+
+ subject = match.group(2)
+ match = prefix_re.match(subject)
+
+ subject = normalise_space(subject)
+
+ subject = subject.strip()
+ if prefixes:
+ subject = '[%s] %s' % (','.join(prefixes), subject)
+
+ return (subject, prefixes)
+
+
+def clean_content(content):
+ """Remove cruft from the email message.
+
+ Catch signature (-- ) and list footer (_____) cruft.
+ """
+ sig_re = re.compile(r'^(-- |_+)\n.*', re.S | re.M)
+ content = sig_re.sub('', content)
+
+ return content.strip()
def parse_patch(content):
@@ -181,6 +527,182 @@ def parse_patch(content):
return patchbuf, commentbuf
+def find_state(mail):
+ """Return the state with the given name or the default."""
+ state_name = mail.get('X-Patchwork-State', '').strip()
+ if state_name:
+ try:
+ return State.objects.get(name__iexact=state_name)
+ except State.DoesNotExist:
+ pass
+ return get_default_initial_patch_state()
+
+
+def auto_delegate(project, filenames):
+ if not filenames:
+ return None
+
+ rules = list(DelegationRule.objects.filter(project=project))
+
+ patch_delegate = None
+
+ for filename in filenames:
+ file_delegate = None
+ for rule in rules:
+ if fnmatch(filename, rule.path):
+ file_delegate = rule.user
+ break
+
+ if file_delegate is None:
+ return None
+
+ if patch_delegate is not None and file_delegate != patch_delegate:
+ return None
+
+ patch_delegate = file_delegate
+
+ return patch_delegate
+
+
+def find_delegate(mail):
+ """Return the delegate with the given email or None."""
+ delegate_email = mail.get('X-Patchwork-Delegate', '').strip()
+ if delegate_email:
+ try:
+ return User.objects.get(email__iexact=delegate_email)
+ except User.DoesNotExist:
+ pass
+ return None
+
+
+def parse_mail(mail, list_id=None):
+ """Parse a mail and add to the database.
+
+ Args:
+ mail (`mbox.Mail`): Mail to parse and add.
+ list_id (str): Mailing list ID
+
+ Returns:
+ None
+ """
+ # some basic sanity checks
+ if 'From' not in mail:
+ raise ValueError("Missing 'From' header")
+
+ if 'Subject' not in mail:
+ raise ValueError("Missing 'Subject' header")
+
+ if 'Message-Id' not in mail:
+ raise ValueError("Missing 'Message-Id' header")
+
+ hint = mail.get('X-Patchwork-Hint', '').lower()
+ if hint == 'ignore':
+ LOGGER.debug("Ignoring email due to 'ignore' hint")
+ return
+
+ if list_id:
+ project = find_project_by_id(list_id)
+ else:
+ project = find_project_by_header(mail)
+
+ if project is None:
+ LOGGER.error('Failed to find a project for email')
+ return
+
+ # parse content
+
+ diff, message = find_content(project, mail)
+
+ if not (diff or message):
+ return # nothing to work with
+
+ msgid = mail.get('Message-Id').strip()
+ author = find_author(mail)
+ name, prefixes = clean_subject(mail.get('Subject'), [project.linkname])
+ x, n = parse_series_marker(prefixes)
+ refs = find_references(mail)
+ date = find_date(mail)
+ headers = find_headers(mail)
+ pull_url = find_pull_request(message)
+
+ # build objects
+
+ if diff or pull_url: # patches or pull requests
+ # we delay the saving until we know we have a patch.
+ author.save()
+
+ delegate = find_delegate(mail)
+ if not delegate and diff:
+ filenames = find_filenames(diff)
+ delegate = auto_delegate(project, filenames)
+
+ patch = Patch(
+ msgid=msgid,
+ project=project,
+ name=name,
+ date=date,
+ headers=headers,
+ submitter=author,
+ content=message,
+ diff=diff,
+ pull_url=pull_url,
+ delegate=delegate,
+ state=find_state(mail))
+ patch.save()
+ LOGGER.debug('Patch saved')
+
+ return patch
+ elif x == 0: # (potential) cover letters
+ # if refs are empty, it's implicitly a cover letter. If not,
+ # however, we need to see if a match already exists and, if
+ # not, assume that it is indeed a new cover letter
+ is_cover_letter = False
+ if not refs == []:
+ try:
+ CoverLetter.objects.all().get(name=name)
+ except CoverLetter.DoesNotExist: # no match => new cover
+ is_cover_letter = True
+ else:
+ is_cover_letter = True
+
+ if is_cover_letter:
+ author.save()
+
+ cover_letter = CoverLetter(
+ msgid=msgid,
+ project=project,
+ name=name,
+ date=date,
+ headers=headers,
+ submitter=author,
+ content=message)
+ cover_letter.save()
+ LOGGER.debug('Cover letter saved')
+
+ return cover_letter
+
+ # comments
+
+ # we only save comments if we have the parent email
+ submission = find_submission_for_comment(project, refs)
+ if not submission:
+ return
+
+ author.save()
+
+ comment = Comment(
+ submission=submission,
+ msgid=msgid,
+ date=date,
+ headers=headers,
+ submitter=author,
+ content=message)
+ comment.save()
+ LOGGER.debug('Comment saved')
+
+ return comment
+
+
def find_filenames(diff):
"""Find files changes in a given diff."""
# normalise spaces
diff --git a/patchwork/tests/test_parser.py b/patchwork/tests/test_parser.py
index eca05a0..684a667 100644
--- a/patchwork/tests/test_parser.py
+++ b/patchwork/tests/test_parser.py
@@ -26,18 +26,19 @@ import os
from django.test import TestCase
-from patchwork.bin.parsemail import clean_subject
-from patchwork.bin.parsemail import find_author
-from patchwork.bin.parsemail import find_content
-from patchwork.bin.parsemail import find_project_by_header
-from patchwork.bin.parsemail import find_pull_request
-from patchwork.bin.parsemail import parse_mail as _parse_mail
-from patchwork.bin.parsemail import parse_series_marker
-from patchwork.bin.parsemail import split_prefixes
from patchwork.models import Comment
from patchwork.models import Patch
from patchwork.models import Person
from patchwork.models import State
+from patchwork.parser import clean_subject
+from patchwork.parser import find_author
+from patchwork.parser import find_content
+from patchwork.parser import find_project_by_header
+from patchwork.parser import find_pull_request
+from patchwork.parser import parse_mail as _parse_mail
+from patchwork.parser import parse_series_marker
+from patchwork.parser import split_prefixes
+from patchwork.tests.utils import create_email
from patchwork.tests.utils import create_project
from patchwork.tests.utils import create_state
from patchwork.tests.utils import create_user
--
1.7.4.1
More information about the Patchwork
mailing list