[PATCH RESEND 1/2] Rework tagging infrastructure
Stephen Finucane
stephen at that.guru
Wed Sep 12 03:55:46 AEST 2018
On Mon, 2018-07-09 at 18:10 +0200, vkabatov at redhat.com wrote:
> From: Veronika Kabatova <vkabatov at redhat.com>
>
> Solve #113 and #57 GitHub issues, fix up returning tags in the API,
> keep track of tag origin to be able to add tags to comments in the API.
>
> Use relations Tag-Patch and Tag-CoverLetter to avoid duplication of
> tags for each patch in series, and use `series` attribute of
> SubmissionTag as a notion of a tag which is related to each patch in the
> series (because it comes from cover letter or it's comments)
>
> Signed-off-by: Veronika Kabatova <vkabatov at redhat.com>
To start, apologies if I'm repeating myself on any of the comments
below: it's been a long time since I first reviewed this (sorry!).
Also, for the next revision of this, this can definitely be broken up
on purpose :) Given that tags aren't currently shown in the API, I'd
move the API changes into a separate patch. Similarly, docs (though not
release notes) aren't needed and can be broken up and moved into a
separate patch. Now to the review...
> ---
> patchwork/api/comment.py | 18 ++++-
> patchwork/api/cover.py | 19 ++++-
> patchwork/api/filters.py | 68 +++++++++++++++-
> patchwork/api/patch.py | 18 ++++-
> patchwork/models.py | 173 ++++++++++++++++------------------------
> patchwork/templatetags/patch.py | 3 +-
> patchwork/views/__init__.py | 3 -
> patchwork/views/utils.py | 10 ++-
> 8 files changed, 191 insertions(+), 121 deletions(-)
>
> diff --git a/patchwork/api/comment.py b/patchwork/api/comment.py
> index 5a5adb1..e03207e 100644
> --- a/patchwork/api/comment.py
> +++ b/patchwork/api/comment.py
> @@ -26,6 +26,7 @@ from patchwork.api.base import BaseHyperlinkedModelSerializer
> from patchwork.api.base import PatchworkPermission
> from patchwork.api.embedded import PersonSerializer
> from patchwork.models import Comment
> +from patchwork.models import SubmissionTag
>
>
> class CommentListSerializer(BaseHyperlinkedModelSerializer):
> @@ -34,6 +35,7 @@ class CommentListSerializer(BaseHyperlinkedModelSerializer):
> subject = SerializerMethodField()
> headers = SerializerMethodField()
> submitter = PersonSerializer(read_only=True)
> + tags = SerializerMethodField()
>
> def get_web_url(self, instance):
> request = self.context.get('request')
> @@ -43,6 +45,20 @@ class CommentListSerializer(BaseHyperlinkedModelSerializer):
> return email.parser.Parser().parsestr(comment.headers,
> True).get('Subject', '')
>
> + def get_tags(self, instance):
> + if not instance.submission.project.use_tags:
> + return {}
> +
> + tags = SubmissionTag.objects.filter(
> + comment=instance
> + ).values_list('tag__name', 'value')
> +
> + result = {}
> + for name, value in tags:
> + result.setdefault(name, []).append(value)
> +
> + return result
This smells of something that should be placed in models.py. Is there
any reason we _can't_ do it there, e.g. via a property?
> +
> def get_headers(self, comment):
> headers = {}
>
> @@ -60,7 +76,7 @@ class CommentListSerializer(BaseHyperlinkedModelSerializer):
> class Meta:
> model = Comment
> fields = ('id', 'web_url', 'msgid', 'date', 'subject', 'submitter',
> - 'content', 'headers')
> + 'content', 'headers', 'tags')
> read_only_fields = fields
> versioned_fields = {
> '1.1': ('web_url', ),
You definitely need changes to 'get_queryset' here. I can help you work
on these but tl;dr: look at what we do for 'check_set' in
'patchwork/api/patch.py'.
> diff --git a/patchwork/api/cover.py b/patchwork/api/cover.py
> index b497fd8..b17b9f7 100644
> --- a/patchwork/api/cover.py
> +++ b/patchwork/api/cover.py
> @@ -30,6 +30,7 @@ from patchwork.api.embedded import PersonSerializer
> from patchwork.api.embedded import ProjectSerializer
> from patchwork.api.embedded import SeriesSerializer
> from patchwork.models import CoverLetter
> +from patchwork.models import SubmissionTag
>
>
> class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> @@ -40,6 +41,7 @@ class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> mbox = SerializerMethodField()
> series = SeriesSerializer(many=True, read_only=True)
> comments = SerializerMethodField()
> + tags = SerializerMethodField()
>
> def get_web_url(self, instance):
> request = self.context.get('request')
> @@ -53,13 +55,26 @@ class CoverLetterListSerializer(BaseHyperlinkedModelSerializer):
> return self.context.get('request').build_absolute_uri(
> reverse('api-cover-comment-list', kwargs={'pk': cover.id}))
>
> + def get_tags(self, instance):
> + if not instance.project.use_tags:
> + return {}
> +
> + tags = SubmissionTag.objects.filter(
> + submission=instance).values_list('tag__name', 'value').distinct()
> +
> + result = {}
> + for name, value in tags:
> + result.setdefault(name, []).append(value)
> +
> + return result
> +
As above, this should be a property on the model, I imagine.
> class Meta:
> model = CoverLetter
> fields = ('id', 'url', 'web_url', 'project', 'msgid', 'date', 'name',
> - 'submitter', 'mbox', 'series', 'comments')
> + 'submitter', 'mbox', 'series', 'comments', 'tags')
> read_only_fields = fields
> versioned_fields = {
> - '1.1': ('web_url', 'mbox', 'comments'),
> + '1.1': ('web_url', 'mbox', 'comments', 'tags'),
This should be '1.2' now.
> }
> extra_kwargs = {
> 'url': {'view_name': 'api-cover-detail'},
Same comments RE: 'get_queryset'.
> diff --git a/patchwork/api/filters.py b/patchwork/api/filters.py
> index 73353d9..e213181 100644
> --- a/patchwork/api/filters.py
> +++ b/patchwork/api/filters.py
> @@ -21,9 +21,11 @@ from django.contrib.auth.models import User
> from django.core.exceptions import ValidationError
> from django.db.models import Q
> from django_filters.rest_framework import FilterSet
> +from django_filters import Filter
> from django_filters import IsoDateTimeFilter
> from django_filters import ModelMultipleChoiceFilter
> from django.forms import ModelMultipleChoiceField as BaseMultipleChoiceField
> +from django.forms.widgets import HiddenInput
> from django.forms.widgets import MultipleHiddenInput
>
> from patchwork.models import Bundle
> @@ -35,6 +37,7 @@ from patchwork.models import Person
> from patchwork.models import Project
> from patchwork.models import Series
> from patchwork.models import State
> +from patchwork.models import SubmissionTag
>
>
> # custom fields, filters
> @@ -136,6 +139,60 @@ class StateFilter(ModelMultipleChoiceFilter):
> field_class = StateChoiceField
>
>
> +class TagNameFilter(Filter):
> +
> + def filter(self, qs, tag_name):
> + submissions_series = SubmissionTag.objects.filter(
> + tag__name__iexact=tag_name
> + ).values_list('submission__id', 'series')
> +
> + submission_list = []
> + series_list = []
> + for submission, series in submissions_series:
> + submission_list.append(submission)
> + series_list.append(series)
> +
> + return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> +
> +
> +class TagValueFilter(Filter):
> +
> + def filter(self, qs, tag_value):
> + submissions_series = SubmissionTag.objects.filter(
> + value__icontains=tag_value
> + ).values_list('submission__id', 'series')
> +
> + submission_list = []
> + series_list = []
> + for submission, series in submissions_series:
> + submission_list.append(submission)
> + series_list.append(series)
> +
> + return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> +
> +
> +class TagFilter(Filter):
> +
> + def filter(self, qs, query):
> + try:
> + tag_name, tag_value = query.split(',', 1)
> + except ValueError:
> + raise ValidationError('Query in format `tag=name,value` expected!')
> +
> + submissions_series = SubmissionTag.objects.filter(
> + tag__name__iexact=tag_name,
> + value__icontains=tag_value
> + ).values_list('submission__id', 'series')
> +
> + submission_list = []
> + series_list = []
> + for submission, series in submissions_series:
> + submission_list.append(submission)
> + series_list.append(series)
> +
> + return qs.filter(Q(id__in=submission_list) | Q(series__in=series_list))
> +
> +
The code for this _looks_ fine, but I do wonder if it's necessary?
Would it be possible to support basic wildcarding in the filter
instead? Something like this:
?tag=*: stephen at that.guru
(but with HTTP encoding). We could also make this wildcarded by
default, e.g.
?tag=Reviewed-by:
Would return all results for this tag.
> class UserChoiceField(ModelMultipleChoiceField):
>
> alternate_lookup = 'username__iexact'
> @@ -173,10 +230,14 @@ class CoverLetterFilterSet(TimestampMixin, FilterSet):
> series = BaseFilter(queryset=Project.objects.all(),
> widget=MultipleHiddenInput)
> submitter = PersonFilter(queryset=Person.objects.all())
> + tagname = TagNameFilter(label='Tag name')
> + tagvalue = TagValueFilter(widget=HiddenInput)
> + tag = TagFilter(widget=HiddenInput)
>
> class Meta:
> model = CoverLetter
> - fields = ('project', 'series', 'submitter')
> + fields = ('project', 'series', 'submitter', 'tagname', 'tagvalue',
> + 'tag')
>
>
> class PatchFilterSet(TimestampMixin, FilterSet):
> @@ -189,11 +250,14 @@ class PatchFilterSet(TimestampMixin, FilterSet):
> submitter = PersonFilter(queryset=Person.objects.all())
> delegate = UserFilter(queryset=User.objects.all())
> state = StateFilter(queryset=State.objects.all())
> + tagname = TagNameFilter(label='Tag name')
> + tagvalue = TagValueFilter(widget=HiddenInput)
> + tag = TagFilter(widget=HiddenInput)
>
> class Meta:
> model = Patch
> fields = ('project', 'series', 'submitter', 'delegate',
> - 'state', 'archived')
> + 'state', 'archived', 'tagname', 'tagvalue', 'tag')
>
>
> class CheckFilterSet(TimestampMixin, FilterSet):
> diff --git a/patchwork/api/patch.py b/patchwork/api/patch.py
> index 9d890eb..cf97c40 100644
> --- a/patchwork/api/patch.py
> +++ b/patchwork/api/patch.py
> @@ -19,6 +19,7 @@
>
> import email.parser
>
> +from django.db.models import Q
> from django.utils.translation import ugettext_lazy as _
> from rest_framework.generics import ListAPIView
> from rest_framework.generics import RetrieveUpdateAPIView
> @@ -35,6 +36,7 @@ from patchwork.api.embedded import SeriesSerializer
> from patchwork.api.embedded import UserSerializer
> from patchwork.models import Patch
> from patchwork.models import State
> +from patchwork.models import SubmissionTag
> from patchwork.parser import clean_subject
>
>
> @@ -109,9 +111,18 @@ class PatchListSerializer(BaseHyperlinkedModelSerializer):
> reverse('api-check-list', kwargs={'patch_id': instance.id}))
>
> def get_tags(self, instance):
> - # TODO(stephenfin): Make tags performant, possibly by reworking the
> - # model
> - return {}
> + if not instance.project.use_tags:
> + return {}
> +
> + tags = SubmissionTag.objects.filter(
> + Q(submission=instance) | Q(series__in=instance.series.all())
> + ).values_list('tag__name', 'value').distinct()
> +
> + result = {}
> + for name, value in tags:
> + result.setdefault(name, []).append(value)
> +
> + return result
>
> class Meta:
> model = Patch
> @@ -160,6 +171,7 @@ class PatchDetailSerializer(PatchListSerializer):
> 'headers', 'content', 'diff', 'prefixes')
> versioned_fields = PatchListSerializer.Meta.versioned_fields
> extra_kwargs = PatchListSerializer.Meta.extra_kwargs
> + versioned_fields = PatchListSerializer.Meta.versioned_fields
>
>
> class PatchList(ListAPIView):
This is missing the entry in the versioned_fields setting. Also, same
comment around 'get_tags'.
> diff --git a/patchwork/models.py b/patchwork/models.py
> index 6268f5b..ba53278 100644
> --- a/patchwork/models.py
> +++ b/patchwork/models.py
> @@ -20,8 +20,6 @@
>
> from __future__ import absolute_import
>
> -from collections import Counter
> -from collections import OrderedDict
> import datetime
> import random
> import re
> @@ -31,6 +29,7 @@ from django.conf import settings
> from django.contrib.auth.models import User
> from django.core.exceptions import ValidationError
> from django.db import models
> +from django.db.models import Q
> from django.utils.encoding import python_2_unicode_compatible
> from django.utils.functional import cached_property
>
> @@ -252,10 +251,6 @@ class Tag(models.Model):
> ' tag\'s count in the patch list view',
> default=True)
>
> - @property
> - def attr_name(self):
> - return 'tag_%d_count' % self.id
> -
> def __str__(self):
> return self.name
>
> @@ -263,64 +258,21 @@ class Tag(models.Model):
> ordering = ['abbrev']
>
>
> -class PatchTag(models.Model):
> - patch = models.ForeignKey('Patch', on_delete=models.CASCADE)
> +class SubmissionTag(models.Model):
> + submission = models.ForeignKey('Submission', on_delete=models.CASCADE)
> tag = models.ForeignKey('Tag', on_delete=models.CASCADE)
> - count = models.IntegerField(default=1)
> + value = models.CharField(max_length=255)
> + comment = models.ForeignKey('Comment', null=True, on_delete=models.CASCADE)
> + series = models.ForeignKey('Series', null=True, on_delete=models.CASCADE)
>
> class Meta:
> - unique_together = [('patch', 'tag')]
> + unique_together = [('submission', 'tag', 'value', 'comment')]
>
>
> def get_default_initial_patch_state():
> return State.objects.get(ordering=0)
>
>
> -class PatchQuerySet(models.query.QuerySet):
> -
> - def with_tag_counts(self, project=None):
> - if project and not project.use_tags:
> - return self
> -
> - # We need the project's use_tags field loaded for Project.tags().
> - # Using prefetch_related means we'll share the one instance of
> - # Project, and share the project.tags cache between all patch.project
> - # references.
> - qs = self.prefetch_related('project')
> - select = OrderedDict()
> - select_params = []
> -
> - # All projects have the same tags, so we're good to go here
> - if project:
> - tags = project.tags
> - else:
> - tags = Tag.objects.all()
> -
> - for tag in tags:
> - select[tag.attr_name] = (
> - "coalesce("
> - "(SELECT count FROM patchwork_patchtag"
> - " WHERE patchwork_patchtag.patch_id="
> - "patchwork_patch.submission_ptr_id"
> - " AND patchwork_patchtag.tag_id=%s), 0)")
> - select_params.append(tag.id)
> -
> - return qs.extra(select=select, select_params=select_params)
> -
> -
> -class PatchManager(models.Manager):
> - use_for_related_fields = True
> - # NOTE(stephenfin): This is necessary to silence a warning with Django >=
> - # 1.10. Remove when 1.10 is the minimum supported version.
> - silence_use_for_related_fields_deprecation = True
> -
> - def get_queryset(self):
> - return PatchQuerySet(self.model, using=self.db)
> -
> - def with_tag_counts(self, project):
> - return self.get_queryset().with_tag_counts(project)
> -
> -
> class EmailMixin(models.Model):
> """Mixin for models with an email-origin."""
> # email metadata
> @@ -334,17 +286,16 @@ class EmailMixin(models.Model):
> submitter = models.ForeignKey(Person, on_delete=models.CASCADE)
> content = models.TextField(null=True, blank=True)
>
> - response_re = re.compile(
> - r'^(Tested|Reviewed|Acked|Signed-off|Nacked|Reported)-by:.*$',
> - re.M | re.I)
> + def _extract_tags(self, tags):
> + found_tags = {}
>
> - @property
> - def patch_responses(self):
> if not self.content:
> - return ''
> + return found_tags
>
> - return ''.join([match.group(0) + '\n' for match in
> - self.response_re.finditer(self.content)])
> + for tag in tags:
> + regex = re.compile(tag.pattern + r'\s*(.*)', re.M | re.I | re.U)
> + found_tags[tag] = regex.findall(self.content)
> + return found_tags
I wonder if we can move this to 'parser.py' now? It doesn't sound like
something we'll be doing at runtime so we could move it out of here
now.
>
> def save(self, *args, **kwargs):
> # Modifying a submission via admin interface changes '\n' newlines in
> @@ -377,6 +328,40 @@ class Submission(FilenameMixin, EmailMixin, models.Model):
> # submission metadata
>
> name = models.CharField(max_length=255)
> + related_tags = models.ManyToManyField(Tag, through=SubmissionTag)
Just 'tags'?
> +
> + def add_tags(self, tag, values, comment=None):
> + if hasattr(self, 'patch'):
> + series = None
> + else:
> + series = self.coverletter.series.first()
> +
> + current_objs = SubmissionTag.objects.filter(submission=self,
> + comment=comment,
> + tag=tag,
> + series=series)
> +
> + if not values:
> + current_objs.delete()
> + return
> +
> + # In case the origin is modified, delete tags that were removed
> + current_objs.exclude(value__in=values).delete()
> +
> + values_to_add = set(values) - set(current_objs.values_list('value',
> + flat=True))
> + SubmissionTag.objects.bulk_create([SubmissionTag(
> + submission=self,
> + tag=tag,
> + value=value,
> + comment=comment,
> + series=series
> + ) for value in values_to_add])
> +
> + def refresh_tags(self):
> + submission_tags = self._extract_tags(Tag.objects.all())
> + for tag in submission_tags:
> + self.add_tags(tag, submission_tags[tag])
Similar comments around moving these functions out of here and into
'parser.py'. I'd _really_ like to have all this stuff in 'parser.py' or
'retag.py'.
>
> # patchwork metadata
>
> @@ -426,7 +411,6 @@ class Patch(SeriesMixin, Submission):
> diff = models.TextField(null=True, blank=True)
> commit_ref = models.CharField(max_length=255, null=True, blank=True)
> pull_url = models.CharField(max_length=255, null=True, blank=True)
> - tags = models.ManyToManyField(Tag, through=PatchTag)
>
> # patchwork metadata
>
> @@ -440,40 +424,6 @@ class Patch(SeriesMixin, Submission):
> # patches in a project without needing to do a JOIN.
> patch_project = models.ForeignKey(Project, on_delete=models.CASCADE)
>
> - objects = PatchManager()
> -
> - @staticmethod
> - def extract_tags(content, tags):
> - counts = Counter()
> -
> - for tag in tags:
> - regex = re.compile(tag.pattern, re.MULTILINE | re.IGNORECASE)
> - counts[tag] = len(regex.findall(content))
> -
> - return counts
> -
> - def _set_tag(self, tag, count):
> - if count == 0:
> - self.patchtag_set.filter(tag=tag).delete()
> - return
> - patchtag, _ = PatchTag.objects.get_or_create(patch=self, tag=tag)
> - if patchtag.count != count:
> - patchtag.count = count
> - patchtag.save()
> -
> - def refresh_tag_counts(self):
> - tags = self.project.tags
> - counter = Counter()
> -
> - if self.content:
> - counter += self.extract_tags(self.content, tags)
> -
> - for comment in self.comments.all():
> - counter = counter + self.extract_tags(comment.content, tags)
> -
> - for tag in tags:
> - self._set_tag(tag, counter[tag])
> -
> def save(self, *args, **kwargs):
> if not hasattr(self, 'state') or not self.state:
> self.state = get_default_initial_patch_state()
> @@ -482,8 +432,7 @@ class Patch(SeriesMixin, Submission):
> self.hash = hash_diff(self.diff)
>
> super(Patch, self).save(**kwargs)
> -
> - self.refresh_tag_counts()
> + self.refresh_tags()
>
> def is_editable(self, user):
> if not is_authenticated(user):
> @@ -495,6 +444,18 @@ class Patch(SeriesMixin, Submission):
> return self.project.is_editable(user)
>
> @property
> + def all_tags(self):
> + related_tags = SubmissionTag.objects.filter(
> + Q(submission=self) | Q(series__in=self.series.all())
> + ).values_list('tag__name', 'value').distinct()
> +
> + patch_tags = {}
> + for tag in self.project.tags:
> + patch_tags[tag] = [related_tag[1] for related_tag in related_tags
> + if related_tag[0] == tag.name]
Hmm, I don't think you want to be doing a list comprehension here as
Python is way slower than the DB. You could do:
related_tags = related_tags.filter(name__in=self.project.tags)
...or similar.
> + return patch_tags
Yeah, we should probably be using this in the API. Not sure about
performance though. This could need some work.
> +
> + @property
> def combined_check_state(self):
> """Return the combined state for all checks.
>
> @@ -611,13 +572,12 @@ class Comment(EmailMixin, models.Model):
>
> def save(self, *args, **kwargs):
> super(Comment, self).save(*args, **kwargs)
> - if hasattr(self.submission, 'patch'):
> - self.submission.patch.refresh_tag_counts()
> + self.refresh_tags()
I'd _personally_ rather we did this in 'parser.py'. There's no reason
to calculate this stuff everytime we change anything about the patch
(e.g. updating the state).
> - def delete(self, *args, **kwargs):
> - super(Comment, self).delete(*args, **kwargs)
> - if hasattr(self.submission, 'patch'):
> - self.submission.patch.refresh_tag_counts()
> + def refresh_tags(self):
> + comment_tags = self._extract_tags(Tag.objects.all())
> + for tag in comment_tags:
> + self.submission.add_tags(tag, comment_tags[tag], comment=self)
>
> def is_editable(self, user):
> return False
> @@ -716,6 +676,7 @@ class Series(FilenameMixin, models.Model):
> self.name = self._format_name(cover)
>
> self.save()
> + cover.refresh_tags()
>
> def add_patch(self, patch, number):
> """Add a patch to the series."""
> diff --git a/patchwork/templatetags/patch.py b/patchwork/templatetags/patch.py
> index 4350e09..f226dc0 100644
> --- a/patchwork/templatetags/patch.py
> +++ b/patchwork/templatetags/patch.py
> @@ -34,8 +34,9 @@ register = template.Library()
> def patch_tags(patch):
> counts = []
> titles = []
> + all_tags = patch.all_tags
> for tag in [t for t in patch.project.tags if t.show_column]:
> - count = getattr(patch, tag.attr_name)
> + count = len(all_tags[tag])
> titles.append('%d %s' % (count, tag.name))
> if count == 0:
> counts.append("-")
> diff --git a/patchwork/views/__init__.py b/patchwork/views/__init__.py
> index f8d23a3..8c41df6 100644
> --- a/patchwork/views/__init__.py
> +++ b/patchwork/views/__init__.py
> @@ -272,9 +272,6 @@ def generic_list(request, project, view, view_args=None, filter_settings=None,
> if patches is None:
> patches = Patch.objects.filter(patch_project=project)
>
> - # annotate with tag counts
> - patches = patches.with_tag_counts(project)
> -
> patches = context['filters'].apply(patches)
> if not editable_order:
> patches = order.apply(patches)
> diff --git a/patchwork/views/utils.py b/patchwork/views/utils.py
> index 2357ab8..f1f78c8 100644
> --- a/patchwork/views/utils.py
> +++ b/patchwork/views/utils.py
> @@ -27,11 +27,13 @@ import email.utils
> import re
>
> from django.conf import settings
> +from django.db.models import Q
> from django.http import Http404
> from django.utils import six
>
> from patchwork.models import Comment
> from patchwork.models import Patch
> +from patchwork.models import SubmissionTag
> from patchwork.models import Series
>
> if settings.ENABLE_REST_API:
> @@ -75,9 +77,11 @@ def _submission_to_mbox(submission):
> else:
> postscript = ''
>
> - # TODO(stephenfin): Make this use the tags infrastructure
> - for comment in Comment.objects.filter(submission=submission):
> - body += comment.patch_responses
> + for (tagname, value) in SubmissionTag.objects.filter(
> + Q(submission=submission)
> + | Q(series__in=submission.series.all())).values_list(
> + 'tag__name', 'value').distinct():
> + body += '%s: %s\n' % (tagname, value)
>
> if postscript:
> body += '---\n' + postscript + '\n'
More information about the Patchwork
mailing list