[PATCH v2 3/7] parser: Parse "Depends-on" tags in emails

Fri Nov 1 09:33:39 AEDT 2024

On Tue, 2024-07-30 at 17:05 -0400, Adam Hassick wrote:
> Add a new function to parse "Depends-on" tags to the parser. The value
> may either be the message ID of a patch or cover letter email already
> received by Patchwork, or the web URL of a patch or series. When this
> tag is found, the parser will add the series (or the series the patch
> belongs to) as a dependency to the series it is creating.
> 
> Signed-off-by: Adam Hassick <ahassick at iol.unh.edu>

Could of comments below but hopefully nothing major.

Stephen

> ---
>  patchwork/parser.py | 90 ++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 89 insertions(+), 1 deletion(-)
> 
> diff --git a/patchwork/parser.py b/patchwork/parser.py
> index 09a53a0..6408c5c 100644
> --- a/patchwork/parser.py
> +++ b/patchwork/parser.py
> @@ -14,11 +14,13 @@ from email.errors import HeaderParseError
>  from fnmatch import fnmatch
>  import logging
>  import re
> +from urllib.parse import urlparse
>  
>  from django.contrib.auth.models import User
>  from django.db.utils import IntegrityError
>  from django.db import transaction
>  from django.utils import timezone as tz_utils
> +from django.urls import resolve, Resolver404
>  
>  from patchwork.models import Cover
>  from patchwork.models import CoverComment
> @@ -32,7 +34,6 @@ from patchwork.models import Series
>  from patchwork.models import SeriesReference
>  from patchwork.models import State
>  
> -
>  _msgid_re = re.compile(r'<[^>]+>')
>  _hunk_re = re.compile(r'^\@\@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? \@\@')
>  _filename_re = re.compile(r'^(---|\+\+\+) (\S+)')
> @@ -1054,6 +1055,90 @@ def parse_pull_request(content):
>      return None
>  
>  
> +def find_series_from_url(url):
> +    """
> +    Get a series from either a series or patch URL.
> +    """
> +    parse_result = urlparse(url)
> +
> +    # Resolve the URL path to see if this is a patch or series detail URL.
> +    try:
> +        result = resolve(parse_result.path)
> +    except Resolver404:
> +        return None

nit: We're not great at logging, but this feels like something we could/should
log?

> +
> +    # TODO: Use the series detail view here.
> +    if result.view_name == 'patch-list' and parse_result.query:
> +        # Parse the query string.
> +        # This can be replaced with something much friendlier once the
> +        # series detail view is implemented.
> +        series_query_param = next(
> +            filter(
> +                lambda x: len(x) == 2 and x[0] == 'series',
> +                map(lambda x: x.split('='), parse_result.query.split('&')),
> +            )
> +        )

Can we use 'urllib.parse.parse_qs' here? [1]

[1] https://docs.python.org/3/library/urllib.parse.html#urllib.parse.parse_qs

> +
> +        if not series_query_param:
> +            return None
> +
> +        series_id = series_query_param[1]
> +
> +        try:
> +            series_id_num = int(series_id)
> +        except ValueError:
> +            return None

This is probably loggable also?

> +
> +        return Series.objects.filter(id=series_id_num).first()

For other reviewers (and a reminder to myself), this will return None if there
are no matches.

> +    elif result.view_name == 'patch-detail':
> +        msgid = Patch.decode_msgid(result.kwargs['msgid'])
> +        patch = Patch.objects.filter(msgid=msgid).first()
> +
> +        if patch:
> +            return patch.series

nit: you use the walrus operator below so...

  if patch := Patch.objects.filter(msgid=msgid).first():
      return patch.series

> +
> +
> +def find_series_from_msgid(msgid):
> +    """
> +    Get a series from
> +    """
> +    if patch := Patch.objects.filter(msgid=msgid).first():
> +        return patch.series
> +
> +    if cover := Cover.objects.filter(msgid=msgid).first():
> +        return cover.series
> +
> +
> +def parse_depends_on(content):
> +    """Parses any dependency hints in the patch or series content."""
> +    dependencies = []
> +
> +    # Discover dependencies given as URLs.
> +    dependencies.extend(
> +        series
> +        for url in re.findall(
> +            r'^Depends-on: (http[s]?:\/\/[\w\d\-.\/=&@:%?_\+()]+)\s*$',
> +            content,
> +            flags=re.MULTILINE | re.IGNORECASE,
> +        )
> +        if (series := find_series_from_url(url)) is not None
> +    )

This is clever, but a for loop would likely be a little easier to read. Any
chance you could rework this slightly? Ditto for the below.

> +
> +    # Discover dependencies given as message IDs.
> +    dependencies.extend(
> +        series
> +        for msgid in re.findall(
> +            r'^Depends-on: (<[^>]+>)\s*$',
> +            content,
> +            flags=re.MULTILINE | re.IGNORECASE,
> +        )
> +        if (series := find_series_from_msgid(msgid)) is not None
> +    )
> +
> +    # Return list of series objects to depend on.
> +    return dependencies
> +
> +
>  def find_state(mail):
>      """Return the state with the given name or the default."""
>      state_name = clean_header(mail.get('X-Patchwork-State', ''))
> @@ -1308,6 +1393,8 @@ def parse_mail(mail, list_id=None):
>              # always have a series
>              series.add_patch(patch, x)
>  
> +        series.add_dependencies(parse_depends_on(message))
> +
>          return patch
>      elif x == 0:  # (potential) cover letters
>          # if refs are empty, it's implicitly a cover letter. If not,
> @@ -1374,6 +1461,7 @@ def parse_mail(mail, list_id=None):
>              logger.debug('Cover letter saved')
>  
>              series.add_cover_letter(cover_letter)
> +            series.add_dependencies(parse_depends_on(message))
>  
>              return cover_letter
>