Commit 22fa410e authored by Robin Tissot

IIIF import, split png conversion and compression, and code cleanup.

parent aeb395d1
@@ -93,17 +93,9 @@ class LineTranscriptionSerializer(serializers.ModelSerializer):
def cleanup(self, data):
return bleach.clean(data, tags=['em', 'strong', 's', 'u'], strip=True)
def create(self, validated_data):
validated_data['content'] = self.cleanup(validated_data['content'])
instance = super().create(validated_data)
instance.line.document_part.recalculate_ordering()
return instance
def update(self, instance, validated_data):
validated_data['content'] = self.cleanup(validated_data['content'])
instance.line.document_part.recalculate_ordering()
return super().update(instance, validated_data)
def validate_content(self, mode):
return self.cleanup(self.initial_data.get('content'))
class LineSerializer(serializers.ModelSerializer):
@@ -113,6 +105,15 @@ class LineSerializer(serializers.ModelSerializer):
model = Line
fields = ('pk', 'order', 'document_part', 'block', 'box', 'transcriptions')
def create(self, validated_data):
instance = super().create(validated_data)
instance.document_part.recalculate_ordering()
return instance
def update(self, instance, validated_data):
instance.document_part.recalculate_ordering()
return super().update(instance, validated_data)
class PartDetailSerializer(PartSerializer):
blocks = BlockSerializer(many=True)
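With this change the ordering hook lives on LineSerializer rather than LineTranscriptionSerializer, so creating or updating a line through the REST API is what reorders the document part. A hypothetical client call (host, token and pk values are assumptions; the endpoint layout and the box payload come from the tests below):

    # Sketch: POST a new line; LineSerializer.create() saves it and then
    # calls instance.document_part.recalculate_ordering().
    import requests

    resp = requests.post(
        'https://escriptorium.example/api/documents/1/parts/2/lines/',
        data={'document_part': 2, 'box': '[10, 10, 50, 50]'},
        headers={'Authorization': 'Token <token>'},  # assumed auth scheme
    )
    assert resp.status_code == 201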
@@ -73,7 +73,7 @@ class PartViewSetTestCase(CoreFactoryTestCase):
self.client.force_login(self.user)
uri = reverse('api:part-list',
kwargs={'document_pk': self.part.document.pk})
with self.assertNumQueries(12):
with self.assertNumQueries(17):
img = self.factory.make_image_file()
resp = self.client.post(uri, {
'image': SimpleUploadedFile(
@@ -170,12 +170,12 @@ class LineViewSetTestCase(CoreFactoryTestCase):
document_part=self.part)
for i in range(2):
l = Line.objects.create(
box=[10+50*i,10,50+50*i,50],
box=[10+50*i, 10, 50+50*i, 50],
document_part=self.part,
block=self.block)
self.line = l
self.orphan = Line.objects.create(
box=[0,0,10,10],
box=[0, 0, 10, 10],
document_part=self.part,
block=None)
@@ -188,10 +188,10 @@ class LineViewSetTestCase(CoreFactoryTestCase):
uri = reverse('api:line-list',
kwargs={'document_pk': self.part.document.pk,
'part_pk': self.part.pk})
with self.assertNumQueries(6):
with self.assertNumQueries(13):
resp = self.client.post(uri, {
'document_part': self.part.pk,
'box': [10,10,50,50]
'box': '[10, 10, 50, 50]'
})
self.assertEqual(resp.status_code, 201)
@@ -233,7 +233,7 @@ class LineTranscriptionViewSetTestCase(CoreFactoryTestCase):
kwargs={'document_pk': self.part.document.pk,
'part_pk': self.part.pk,
'pk': self.lt.pk})
with self.assertNumQueries(8):
with self.assertNumQueries(5):
resp = self.client.patch(uri, {
'content': 'update'
}, content_type='application/json')
@@ -245,7 +245,7 @@ class LineTranscriptionViewSetTestCase(CoreFactoryTestCase):
kwargs={'document_pk': self.part.document.pk,
'part_pk': self.part.pk})
with self.assertNumQueries(10):
with self.assertNumQueries(6):
resp = self.client.post(uri, {
'line': self.line2.pk,
'transcription': self.transcription.pk,
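The updated query counts rely on Django's assertNumQueries, which fails if the wrapped block issues a different number of SQL queries than declared. A minimal self-contained illustration of the pattern used throughout these tests:

    from django.contrib.auth.models import User
    from django.test import TestCase

    class QueryCountExample(TestCase):
        def test_exact_query_count(self):
            # the block must issue exactly two queries, no more, no fewer
            with self.assertNumQueries(2):
                list(User.objects.all())  # one SELECT
                User.objects.count()      # one COUNT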
@@ -150,12 +150,13 @@ class DocumentProcessForm(BootstrapFormMixin, forms.ModelForm):
self.parts[0].save()
else:
for part in self.parts:
part.task_binarize(user_pk=self.user.pk)
part.task('binarize', user_pk=self.user.pk)
elif task == self.TASK_SEGMENT:
for part in self.parts:
part.task_segment(user_pk=self.user.pk,
steps=self.cleaned_data['segmentation_steps'],
text_direction=self.cleaned_data['text_direction'])
part.task('segment',
user_pk=self.user.pk,
steps=self.cleaned_data['segmentation_steps'],
text_direction=self.cleaned_data['text_direction'])
elif task == self.TASK_TRAIN:
if self.cleaned_data.get('new_model'):
# create model and corresponding OcrModel
@@ -175,7 +176,7 @@ class DocumentProcessForm(BootstrapFormMixin, forms.ModelForm):
model = None
for part in self.parts:
part.task_transcribe(user_pk=self.user.pk, model=model)
part.task('transcribe', user_pk=self.user.pk, model=model)
self.save() # save settings
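The per-task helpers (task_binarize, task_segment, task_transcribe) collapse into the single DocumentPart.task(task_name, **kwargs) dispatcher, so the form only varies the task name and keyword arguments. A sketch of the resulting call sites (the keyword values here are illustrative):

    part.task('binarize', user_pk=user.pk)
    part.task('segment', user_pk=user.pk, steps='both',
              text_direction='horizontal-lr')
    part.task('transcribe', user_pk=user.pk, model=model)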
# Generated by Django 2.1.4 on 2019-04-10 13:18
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0011_auto_20190317_1550'),
]
operations = [
migrations.AddField(
model_name='documentpart',
name='source',
field=models.CharField(blank=True, max_length=1024),
),
]
# Generated by Django 2.1.4 on 2019-04-10 13:53
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0012_documentpart_source'),
]
operations = [
migrations.AlterField(
model_name='documentpart',
name='workflow_state',
field=models.PositiveSmallIntegerField(choices=[(0, 'Created'), (1, 'Converting'), (2, 'Converted'), (3, 'Binarizing'), (4, 'Binarized'), (5, 'Segmenting'), (6, 'Segmented'), (7, 'Transcribing')], default=0),
),
]
@@ -21,7 +21,7 @@ from django.db.models.signals import pre_delete
from celery.result import AsyncResult
from celery.task.control import inspect
from celery import chain
from celery import chain, group
from easy_thumbnails.files import get_thumbnailer, generate_all_aliases
from ordered_model.models import OrderedModel
@@ -174,6 +174,7 @@ class DocumentPart(OrderedModel):
"""
name = models.CharField(max_length=512, blank=True)
image = models.ImageField(upload_to=document_images_path)
source = models.CharField(max_length=1024, blank=True)
bw_backend = models.CharField(max_length=128, default='kraken')
bw_image = models.ImageField(upload_to=document_images_path,
null=True, blank=True,
@@ -185,8 +186,8 @@ class DocumentPart(OrderedModel):
order_with_respect_to = 'document'
WORKFLOW_STATE_CREATED = 0
WORKFLOW_STATE_COMPRESSING = 1
WORKFLOW_STATE_COMPRESSED = 2
WORKFLOW_STATE_CONVERTING = 1
WORKFLOW_STATE_CONVERTED = 2
WORKFLOW_STATE_BINARIZING = 3
WORKFLOW_STATE_BINARIZED = 4
WORKFLOW_STATE_SEGMENTING = 5
@@ -194,8 +195,8 @@ class DocumentPart(OrderedModel):
WORKFLOW_STATE_TRANSCRIBING = 7
WORKFLOW_STATE_CHOICES = (
(WORKFLOW_STATE_CREATED, _("Created")),
(WORKFLOW_STATE_COMPRESSING, _("Compressing")),
(WORKFLOW_STATE_COMPRESSED, _("Compressed")),
(WORKFLOW_STATE_CONVERTING, _("Converting")),
(WORKFLOW_STATE_CONVERTED, _("Converted")),
(WORKFLOW_STATE_BINARIZING, _("Binarizing")),
(WORKFLOW_STATE_BINARIZED, _("Binarized")),
(WORKFLOW_STATE_SEGMENTING, _("Segmenting")),
@@ -222,8 +223,8 @@ class DocumentPart(OrderedModel):
return str(self)
@property
def compressed(self):
return self.workflow_state >= self.WORKFLOW_STATE_COMPRESSED
def converted(self):
return self.workflow_state >= self.WORKFLOW_STATE_CONVERTED
@property
def binarized(self):
@@ -280,9 +281,10 @@ class DocumentPart(OrderedModel):
return 0
# fetch all lines and regroup them by block
qs = self.lines.select_related('block').all()
ls = [(l, (origin_pt(l.block.box), origin_pt(l.box))
if l.block else (origin_pt(l.box), origin_pt(l.box)))
for l in self.lines.all()]
for l in qs]
# sort depending on the distance to the origin
ls.sort(key=functools.cmp_to_key(lambda a,b: cmp_pts(a[1], b[1])))
@@ -296,7 +298,7 @@ class DocumentPart(OrderedModel):
new = self.pk is None
instance = super().save(*args, **kwargs)
if new:
self.task_compress()
self.task('convert')
send_event('document', self.document.pk, "part:new", {"id": self.pk})
else:
self.calculate_progress()
@@ -364,8 +366,8 @@ class DocumentPart(OrderedModel):
queued = ([task['id'] for queue in i.scheduled().values() for task in queue] +
[task['id'] for queue in i.active().values() for task in queue] +
[task['id'] for queue in i.reserved().values() for task in queue])
if self.workflow_state == self.WORKFLOW_STATE_COMPRESSING:
task_name = 'core.tasks.lossless_compression'
if self.workflow_state == self.WORKFLOW_STATE_CONVERTING:
task_name = 'core.tasks.convert'
if (not task_name in self.tasks or
self.tasks[task_name]['task_id'] not in queued):
self.workflow_state = self.WORKFLOW_STATE_CREATED
@@ -374,7 +376,7 @@ class DocumentPart(OrderedModel):
task_name = 'core.tasks.binarize'
if (not task_name in self.tasks or
self.tasks[task_name]['task_id'] not in queued):
self.workflow_state = self.WORKFLOW_STATE_COMPRESSED
self.workflow_state = self.WORKFLOW_STATE_CONVERTED
redis_.set('process-%d' % self.pk, json.dumps({task_name: {"status": "error"}}))
elif self.workflow_state == self.WORKFLOW_STATE_SEGMENTING:
task_name = 'core.tasks.segment'
@@ -393,37 +395,38 @@ class DocumentPart(OrderedModel):
if settings.THUMBNAIL_ENABLE:
generate_all_aliases(self.image, include_global=True)
def compress(self):
if self.workflow_state < self.WORKFLOW_STATE_COMPRESSING:
self.workflow_state = self.WORKFLOW_STATE_COMPRESSING
def convert(self):
if self.workflow_state < self.WORKFLOW_STATE_CONVERTING:
self.workflow_state = self.WORKFLOW_STATE_CONVERTING
self.save()
convert = False
old_name = self.image.file.name
filename, extension = os.path.splitext(old_name)
if extension != ".png":
convert = True
new_name = filename + ".png"
error = subprocess.check_call(["convert", old_name, new_name])
if error:
raise RuntimeError("Error trying to convert file(%s) to png.")
else:
new_name = old_name
self.image = new_name.split(settings.MEDIA_ROOT)[1][1:]
os.remove(old_name)
if self.workflow_state < self.WORKFLOW_STATE_CONVERTED:
self.workflow_state = self.WORKFLOW_STATE_CONVERTED
self.save()
def compress(self):
filename, extension = os.path.splitext(self.image.file.name)
opti_name = filename + '_opti.png'
try:
subprocess.check_call(["pngcrush", "-q", new_name, opti_name])
subprocess.check_call(["pngcrush", "-q", self.image.file.name, opti_name])
except Exception as e:
# Note: let it fail, it's fine
logger.exception("png optimization failed.")
os.rename(opti_name, new_name)
if convert:
self.image = new_name.split(settings.MEDIA_ROOT)[1][1:]
os.remove(old_name)
if self.workflow_state < self.WORKFLOW_STATE_COMPRESSED:
self.workflow_state = self.WORKFLOW_STATE_COMPRESSED
self.save()
logger.exception("png optimization failed for %s." % filename)
if DEBUG:
raise e
else:
os.rename(opti_name, self.image.file.name)
def binarize(self):
if self.workflow_state < self.WORKFLOW_STATE_BINARIZING:
@@ -529,55 +532,31 @@ class DocumentPart(OrderedModel):
redis_.set('process-%d' % self.pk, json.dumps({tasks[-1].name: {"status": "pending"}}))
chain(*tasks).delay()
def task_compress(self):
def task(self, task_name, **kwargs):
if not self.tasks_finished():
raise AlreadyProcessingException
tasks = []
tasks.append(lossless_compression.si(self.pk))
if settings.THUMBNAIL_ENABLE:
tasks.append(generate_part_thumbnails.si(self.pk))
self.chain_tasks(*tasks)
def task_binarize(self, user_pk=None, binarizer=None):
if not self.tasks_finished():
raise AlreadyProcessingException
tasks = []
if not self.compressed:
tasks.append(lossless_compression.si(self.pk))
tasks_order = ['convert', 'binarize', 'segment', 'transcribe']
if task_name == 'convert' or self.workflow_state < self.WORKFLOW_STATE_CONVERTED:
sig = convert.si(self.pk)
sig.link(lossless_compression.si(self.pk))
if settings.THUMBNAIL_ENABLE:
tasks.append(generate_part_thumbnails.si(self.pk))
tasks.append(binarize.si(self.pk, user_pk=user_pk, binarizer=binarizer))
self.chain_tasks(*tasks)
def task_segment(self, user_pk=None, steps=None, text_direction=None):
if not self.tasks_finished():
raise AlreadyProcessingException
sig.link(generate_part_thumbnails.si(self.pk))
tasks.append(sig)
tasks = []
if not self.compressed:
tasks.append(lossless_compression.si(self.pk))
if settings.THUMBNAIL_ENABLE:
tasks.append(generate_part_thumbnails.si(self.pk))
if not self.binarized:
tasks.append(binarize.si(self.pk, user_pk=user_pk))
tasks.append(segment.si(self.pk, user_pk=user_pk, steps=steps, text_direction=text_direction))
self.chain_tasks(*tasks)
def task_transcribe(self, user_pk=None, model=None):
if not self.tasks_finished():
raise AlreadyProcessingException
if (task_name == 'binarize'
or (tasks_order.index(task_name) > tasks_order.index('binarize')
and not self.binarized)):
tasks.append(binarize.si(self.pk, **kwargs))
if (task_name == 'segment'
or (tasks_order.index(task_name) > tasks_order.index('segment')
and not self.segmented)):
tasks.append(segment.si(self.pk, **kwargs))
if task_name == 'transcribe':
tasks.append(transcribe.si(self.pk, **kwargs))
tasks = []
if not self.compressed:
tasks.append(lossless_compression.si(self.pk))
if settings.THUMBNAIL_ENABLE:
tasks.append(generate_part_thumbnails.si(self.pk))
if not self.binarized:
tasks.append(binarize.si(self.pk, user_pk=user_pk))
if not self.segmented:
tasks.append(segment.si(self.pk, user_pk=user_pk))
tasks.append(transcribe.si(self.pk, user_pk=user_pk, model_pk=model and model.pk or None))
self.chain_tasks(*tasks)
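Conversion and compression are now separate celery tasks, with compression (and thumbnailing) linked onto the convert signature rather than chained inline. A condensed sketch of what task('binarize') builds for a part that has not been converted yet (it mirrors the dispatcher above; the chain_tasks/redis plumbing is elided):

    from celery import chain

    sig = convert.si(part.pk)                    # png conversion runs first
    sig.link(lossless_compression.si(part.pk))   # pngcrush piggybacks on convert
    if settings.THUMBNAIL_ENABLE:
        sig.link(generate_part_thumbnails.si(part.pk))
    chain(sig, binarize.si(part.pk, user_pk=user.pk)).delay()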
@@ -324,9 +324,10 @@ $(document).ready(function() {
});
$('#alerts-container').on('import:progress', function(ev, data) {
$('#import-counter').parent().css({opacity: 100});
if (data.progress) {
$('#import-counter').text(data.progress+"/"+data.total);
}
}
});
$('#alerts-container').on('import:fail', function(ev, data) {
@@ -49,8 +49,19 @@ def generate_part_thumbnails(instance_pk):
part.generate_thumbnails()
@shared_task(bind=True)
def lossless_compression(self, instance_pk):
@shared_task
def convert(instance_pk, **kwargs):
try:
DocumentPart = apps.get_model('core', 'DocumentPart')
part = DocumentPart.objects.get(pk=instance_pk)
except DocumentPart.DoesNotExist:
logger.error('Trying to convert nonexistent DocumentPart: %d', instance_pk)
raise
part.convert()
@shared_task
def lossless_compression(instance_pk, **kwargs):
try:
DocumentPart = apps.get_model('core', 'DocumentPart')
part = DocumentPart.objects.get(pk=instance_pk)
@@ -61,7 +72,7 @@ def lossless_compression(self, instance_pk):
@shared_task
def binarize(instance_pk, user_pk=None, binarizer=None):
def binarize(instance_pk, user_pk=None, binarizer=None, **kwargs):
try:
DocumentPart = apps.get_model('core', 'DocumentPart')
part = DocumentPart.objects.get(pk=instance_pk)
@@ -93,7 +104,7 @@ def binarize(instance_pk, user_pk=None, binarizer=None):
@shared_task
def segment(instance_pk, user_pk=None, steps=None, text_direction=None):
def segment(instance_pk, user_pk=None, steps=None, text_direction=None, **kwargs):
"""
steps can be either 'regions', 'lines' or 'both'
"""
@@ -128,12 +139,12 @@ def segment(instance_pk, user_pk=None, steps=None, text_direction=None):
@shared_task
def train(model, pks, user_pk=None):
def train(model, pks, user_pk=None, **kwargs):
pass
@shared_task
def transcribe(instance_pk, model_pk=None, user_pk=None, text_direction=None):
def transcribe(instance_pk, model_pk=None, user_pk=None, text_direction=None, **kwargs):
try:
DocumentPart = apps.get_model('core', 'DocumentPart')
part = DocumentPart.objects.get(pk=instance_pk)
@@ -177,6 +188,8 @@ def transcribe(instance_pk, model_pk=None, user_pk=None, text_direction=None):
@before_task_publish.connect
def before_publish_state(sender=None, body=None, **kwargs):
if not sender.startswith('core.tasks'):
return
instance_id = body[0][0]
data = json.loads(redis_.get('process-%d' % instance_id) or '{}')
@@ -186,16 +199,15 @@ def before_publish_state(sender=None, body=None, **kwargs):
}
redis_.set('process-%d' % instance_id, json.dumps(data))
# Note: only this part of the signal is filtered
# against this module's tasks, which isn't great
if sender.name.startswith('core.tasks'):
update_client_state(instance_id, sender, 'pending')
update_client_state(instance_id, sender, 'pending')
@task_prerun.connect
@task_success.connect
@task_failure.connect
def done_state(sender=None, body=None, **kwargs):
if not sender.name.startswith('core.tasks'):
return
instance_id = sender.request.args[0]
data = json.loads(redis_.get('process-%d' % instance_id) or '{}')
@@ -214,5 +226,4 @@ def done_state(sender=None, body=None, **kwargs):
data = {k:v for k,v in data.items() if v['status'] != 'pending'}
redis_.set('process-%d' % instance_id, json.dumps(data))
if sender.name.startswith('core.tasks'):
update_client_state(instance_id, sender.name, status)
update_client_state(instance_id, sender.name, status)
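Both signal handlers now filter on the sender with an early return at the top instead of guarding only the final update_client_state call. The pattern, reduced to a skeleton:

    from celery.signals import task_prerun

    @task_prerun.connect
    def state_handler(sender=None, **kwargs):
        # bail out immediately for tasks that don't belong to this module
        if not sender.name.startswith('core.tasks'):
            return
        # ... update redis and push the new state to the client ...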
@@ -17,10 +17,10 @@ class TasksTestCase(CoreFactoryTestCase):
def test_workflow(self):
self.assertEqual(self.part.workflow_state,
self.part.WORKFLOW_STATE_CREATED)
self.part.compress()
self.part.convert()
self.assertEqual(self.part.workflow_state,
self.part.WORKFLOW_STATE_COMPRESSED)
self.part.WORKFLOW_STATE_CONVERTED)
self.part.compress()
self.part.binarize()
self.assertEqual(self.part.workflow_state,
self.part.WORKFLOW_STATE_BINARIZED)
@@ -35,15 +35,14 @@ class TasksTestCase(CoreFactoryTestCase):
self.part.transcribe()
self.assertEqual(self.part.workflow_state,
self.part.WORKFLOW_STATE_TRANSCRIBING)
def test_post(self):
def test_process_transcribe(self):
self.client.force_login(self.part.document.owner)
uri = reverse('document-parts-process', kwargs={
'pk': self.part.document.pk})
parts = self.part.document.parts.all()
for part in parts:
redis_.get('process-%d' % part.pk)
part.workflow_state = part.WORKFLOW_STATE_COMPRESSED
part.workflow_state = part.WORKFLOW_STATE_CONVERTED
part.save()
response = self.client.post(uri, {
import json
import requests
from django import forms
from django.core.validators import FileExtensionValidator
from django.core.files.base import ContentFile
from django.core.files.uploadedfile import SimpleUploadedFile
from django.utils.translation import gettext as _
from django.utils.functional import cached_property
from bootstrap.forms import BootstrapFormMixin
from imports.models import Import
from imports.parsers import make_parser, ParseError
from imports.tasks import xml_import
from imports.tasks import document_import
class ImportForm(BootstrapFormMixin, forms.Form):
@@ -16,6 +19,10 @@ class ImportForm(BootstrapFormMixin, forms.Form):
xml_file = forms.FileField(
required=False,
help_text=_("Alto or Abbyy XML."))
iiif_uri = forms.URLField(
required=False,
label=_("iiif manifesto"),
help_text=_("exp: https://gallica.bnf.fr/iiif/ark:/12148/btv1b8610793w/manifest.json"))
resume_import = forms.BooleanField(
required=False,
label=_("Resume previous import"),
@@ -27,10 +34,18 @@ class ImportForm(BootstrapFormMixin, forms.Form):
self.current_import = self.document.import_set.order_by('started_on').last()
super().__init__(*args, **kwargs)
def clean_xml_file(self):
tmpfile = self.cleaned_data.get('xml_file')
# check its alto or abbyy
return tmpfile
# def clean_xml_file(self):
# tmpfile = self.cleaned_data.get('xml_file')
# # check its alto or abbyy
# return tmpfile
def clean_iiif_uri(self):
uri = self.cleaned_data.get('iiif_uri')
try:
if uri:
return requests.get(uri).json()
except json.decoder.JSONDecodeError:
raise ValidationError(_("The document pointed to by the given uri doesn't seem to be valid json."))
def clean_parts(self):
try:
@@ -41,30 +56,43 @@ class ImportForm(BootstrapFormMixin, forms.Form):
def clean(self):
cleaned_data = super().clean()
if not cleaned_data["resume_import"] and not cleaned_data['xml_file']:
if (not cleaned_data["resume_import"]
and not cleaned_data['xml_file']
and not cleaned_data['iiif_uri']):
raise forms.ValidationError(_("Choose one type of import."))
if cleaned_data['xml_file']:
try:
parser = make_parser(cleaned_data['xml_file'])
except ParseError:
raise forms.ValidationError(_("Couldn't parse the given xml file."))
if parser and len(parser.pages) != len(cleaned_data['parts']):
if parser and parser.total != len(cleaned_data['parts']):
raise forms.ValidationError(
_("The number of pages in the import file doesn't match the number of selected images, respectively %d and %d." %
(parser.total, len(cleaned_data['parts']))))
return cleaned_data
def save(self):
if self.cleaned_data['resume_import'] and self.current_import.failed:
self.instance = self.current_import
else:
self.instance = Import.objects.create(
imp = Import(
document = self.document,
started_by = self.user,
parts=self.cleaned_data['parts'],
import_file=self.cleaned_data['xml_file'])
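The new iiif_uri field expects a IIIF Presentation manifest such as the Gallica example in the help text. A hedged sketch of walking such a manifest to collect the page image URLs (structure per the IIIF Presentation 2.x spec; no error handling):

    import requests

    uri = 'https://gallica.bnf.fr/iiif/ark:/12148/btv1b8610793w/manifest.json'
    manifest = requests.get(uri).json()

    # IIIF Presentation 2.x nests images as sequences -> canvases -> images
    for canvas in manifest['sequences'][0]['canvases']:
        resource = canvas['images'][0]['resource']
        print(resource['@id'])  # direct URL of the page image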