Commit 1691d8e3 authored by Robin Tissot's avatar Robin Tissot
Browse files

Merge branch 'develop'

parents 6c6bdabf 70a75308
eScriptorium
Copyright (C) 2018 Robin Tissot (PSL)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
\ No newline at end of file
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
env/
static/
media/
test_media/
logs/
......@@ -110,6 +110,11 @@ class LineTranscriptionSerializer(serializers.ModelSerializer):
def validate_content(self, mode):
    """Return the cleaned-up transcription content.

    The value handed in by the framework (``mode``) is not used: the raw
    ``content`` entry is read from ``self.initial_data`` and passed
    through ``self.cleanup``.
    """
    raw_content = self.initial_data.get('content')
    return self.cleanup(raw_content)
def create(self, validated_data):
    """Create the line transcription, then refresh its part's progress.

    Delegates creation to the parent serializer, calls
    ``calculate_progress()`` on the document part owning the transcribed
    line, and returns the newly created instance.
    """
    line_transcription = super().create(validated_data)
    part = line_transcription.line.document_part
    part.calculate_progress()
    return line_transcription
class LineSerializer(serializers.ModelSerializer):
class Meta:
......
......@@ -188,7 +188,7 @@ class LineViewSetTestCase(CoreFactoryTestCase):
uri = reverse('api:line-list',
kwargs={'document_pk': self.part.document.pk,
'part_pk': self.part.pk})
with self.assertNumQueries(12):
with self.assertNumQueries(11):
resp = self.client.post(uri, {
'document_part': self.part.pk,
'box': '[10, 10, 50, 50]'
......@@ -245,7 +245,7 @@ class LineTranscriptionViewSetTestCase(CoreFactoryTestCase):
kwargs={'document_pk': self.part.document.pk,
'part_pk': self.part.pk})
with self.assertNumQueries(6):
with self.assertNumQueries(7):
resp = self.client.post(uri, {
'line': self.line2.pk,
'transcription': self.transcription.pk,
......
......@@ -91,8 +91,8 @@ class DocumentViewSet(ModelViewSet):
else:
return Response({'error': 'Invalid format.'}, status=status.HTTP_400_BAD_REQUEST)
response['Content-Disposition'] = 'attachment; filename="export-%s-%s.%s"' % (
slugify(self.object.name), datetime.now().isoformat()[:16], extension)
response['Content-Disposition'] = 'attachment; filename="export_%s_%s.%s"' % (
slugify(self.object.name), datetime.now().isoformat()[:16].replace('-', '_'), extension)
return response
def get_part_data(self, part_pk, transcription):
......@@ -145,6 +145,7 @@ class PartViewSet(ModelViewSet):
part = DocumentPart.objects.get(document=document_pk, pk=pk)
part.cancel_tasks()
part.refresh_from_db()
del part.tasks # reset cache
return Response({'status': 'canceled', 'workflow': part.workflow})
......
......@@ -25,7 +25,6 @@ class ScriptAdmin(admin.ModelAdmin):
list_filter = ['text_direction']
admin.site.register(Document, DocumentAdmin)
admin.site.register(DocumentProcessSettings)
admin.site.register(DocumentPart, DocumentPartAdmin)
admin.site.register(LineTranscription, LineTranscriptionAdmin)
admin.site.register(Typology)
......
......@@ -7,7 +7,7 @@ from django.db import transaction
from django.db.models import Q
from django.forms.models import inlineformset_factory
from django.utils.functional import cached_property
from django.utils.translation import gettext as _
from django.utils.translation import gettext_lazy as _
from bootstrap.forms import BootstrapFormMixin
from core.models import *
......@@ -57,7 +57,7 @@ class DocumentShareForm(BootstrapFormMixin, forms.ModelForm):
if self.cleaned_data['username']:
doc.shared_with_users.add(self.cleaned_data['username'])
return doc
class MetadataForm(BootstrapFormMixin, forms.ModelForm):
key = forms.CharField()
......@@ -79,11 +79,13 @@ class MetadataForm(BootstrapFormMixin, forms.ModelForm):
return key
MetadataFormSet = inlineformset_factory(Document, DocumentMetadata, form=MetadataForm,
MetadataFormSet = inlineformset_factory(Document, DocumentMetadata,
form=MetadataForm,
extra=1, can_delete=True)
class DocumentProcessForm(BootstrapFormMixin, forms.ModelForm):
class DocumentProcessForm(BootstrapFormMixin, forms.Form):
# TODO: split this form into one for each process?!
TASK_BINARIZE = 'binarize'
TASK_SEGMENT = 'segment'
TASK_TRAIN = 'train'
......@@ -95,25 +97,44 @@ class DocumentProcessForm(BootstrapFormMixin, forms.ModelForm):
(TASK_TRANSCRIBE, 4),
))
parts = forms.CharField()
# binarization
bw_image = forms.ImageField(required=False)
segmentation_steps = forms.ChoiceField(choices=(
BINARIZER_CHOICES = (('kraken', _("Kraken")),)
binarizer = forms.ChoiceField(required=False,
choices=BINARIZER_CHOICES,
initial='kraken')
# segment
SEGMENTATION_STEPS_CHOICES = (
('regions', _('Regions')),
('lines', _('Lines')),
('both', _('Lines and regions'))
), initial='lines', required=False)
new_model = forms.CharField(required=False, label=_('Name'))
('both', _('Lines and regions')))
segmentation_steps = forms.ChoiceField(choices=SEGMENTATION_STEPS_CHOICES,
initial='lines', required=False)
TEXT_DIRECTION_CHOICES = (('horizontal-lr', _("Horizontal l2r")),
('horizontal-rl', _("Horizontal r2l")),
('vertical-lr', _("Vertical l2r")),
('vertical-rl', _("Vertical r2l")))
text_direction = forms.ChoiceField(initial='horizontal-lr', required=False,
choices=TEXT_DIRECTION_CHOICES)
# transcribe
upload_model = forms.FileField(required=False,
validators=[FileExtensionValidator(
allowed_extensions=['mlmodel', 'pronn', 'clstm'])])
ocr_model = forms.ModelChoiceField(queryset=OcrModel.objects.all(), label=_("Model"), required=False)
class Meta:
model = DocumentProcessSettings
fields = '__all__'
# train
new_model = forms.CharField(required=False, label=_('Model name'))
train_model = forms.ModelChoiceField(queryset=OcrModel.objects.all(), label=_("Model"), required=False)
transcription = forms.ModelChoiceField(queryset=Transcription.objects.all(), required=False)
# typology = forms.ModelChoiceField(Typology, required=False,
# limit_choices_to={'target': Typology.TARGET_PART})
def __init__(self, document, user, *args, **kwargs):
self.user = user
self.document = document
self.user = user
super().__init__(*args, **kwargs)
# self.fields['typology'].widget = forms.HiddenInput() # for now
# self.fields['typology'].initial = Typology.objects.get(name="Page")
......@@ -121,12 +142,12 @@ class DocumentProcessForm(BootstrapFormMixin, forms.ModelForm):
if self.document.read_direction == self.document.READ_DIRECTION_RTL:
self.initial['text_direction'] = 'horizontal-rl'
self.fields['binarizer'].widget.attrs['disabled'] = True
self.fields['binarizer'].required = False
self.fields['text_direction'].required = False
self.fields['train_model'].queryset = OcrModel.objects.filter(document=self.document)
self.fields['ocr_model'].queryset = OcrModel.objects.filter(
Q(document=None) | Q(document=self.document), trained=True)
Q(document=None, script=document.main_script)
| Q(document=self.document))
self.fields['transcription'].queryset = Transcription.objects.filter(document=self.document)
@cached_property
def parts(self):
pks = self.data.getlist('parts')
......@@ -147,9 +168,15 @@ class DocumentProcessForm(BootstrapFormMixin, forms.ModelForm):
raise forms.ValidationError(_("Uploaded image should be black and white."))
isize = (self.parts[0].image.width, self.parts[0].image.height)
if fh.size != isize:
raise forms.ValidationError(_("Uploaded image should be the same size as original image {}.").format(isize))
raise forms.ValidationError(_("Uploaded image should be the same size as original image {size}.").format(size=isize))
return img
def clean_train_model(self):
    """Validate the ``train_model`` field.

    Returns the selected model unchanged (including an empty selection).

    Raises:
        AlreadyProcessingException: if the selected model has its
            ``training`` flag set.
    """
    selected = self.cleaned_data['train_model']
    if not selected:
        # Nothing selected: hand the empty value back untouched.
        return selected
    if selected.training:
        raise AlreadyProcessingException
    return selected
def process(self):
task = self.cleaned_data.get('task')
if task == self.TASK_BINARIZE:
......@@ -159,33 +186,35 @@ class DocumentProcessForm(BootstrapFormMixin, forms.ModelForm):
else:
for part in self.parts:
part.task('binarize', user_pk=self.user.pk)
elif task == self.TASK_SEGMENT:
for part in self.parts:
part.task('segment',
user_pk=self.user.pk,
steps=self.cleaned_data['segmentation_steps'],
text_direction=self.cleaned_data['text_direction'])
elif task == self.TASK_TRAIN:
if self.cleaned_data.get('new_model'):
# create model and corresponding OcrModel
pass
# part.train(user_pk=self.user.pk, model=None)
elif task == self.TASK_TRANSCRIBE:
if self.cleaned_data.get('upload_model'):
model = OcrModel.objects.create(
document=self.parts[0].document,
owner=self.user,
name=self.cleaned_data['upload_model'].name,
file=self.cleaned_data['upload_model'],
trained=True, document=self.parts[0].document)
self.instance.ocr_model = model # save to settings
file=self.cleaned_data['upload_model'])
elif self.cleaned_data['ocr_model']:
model = self.cleaned_data['ocr_model']
else:
model = None
for part in self.parts:
part.task('transcribe', user_pk=self.user.pk, model_pk=model and model.pk or None)
self.save() # save settings
elif task == self.TASK_TRAIN:
model = self.cleaned_data.get('upload_model') or self.cleaned_data.get('train_model')
OcrModel.train(self.parts,
self.cleaned_data['transcription'],
model=model,
model_name=self.cleaned_data['new_model'],
user=self.user)
class UploadImageForm(BootstrapFormMixin, forms.ModelForm):
......
# Generated by Django 2.1.4 on 2019-05-20 08:51
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
# Schema migration: adds an optional `script` link to the OcrModel model.
# Applied on top of core migration 0022_documentpart_original_filename.
dependencies = [
('core', '0022_documentpart_original_filename'),
]
operations = [
# Nullable, blank-able foreign key to core.Script; SET_NULL means that
# deleting the referenced Script clears this column rather than
# deleting the OcrModel row.
migrations.AddField(
model_name='ocrmodel',
name='script',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='core.Script'),
),
]
# Generated by Django 2.1.4 on 2019-05-20 11:38
from django.db import migrations
class Migration(migrations.Migration):
# Schema migration: removes the DocumentProcessSettings model entirely.
# Applied on top of core migration 0023_ocrmodel_script.
dependencies = [
('core', '0023_ocrmodel_script'),
]
operations = [
# Four fields of the model are removed first (document, ocr_model,
# train_model, typology) ...
migrations.RemoveField(
model_name='documentprocesssettings',
name='document',
),
migrations.RemoveField(
model_name='documentprocesssettings',
name='ocr_model',
),
migrations.RemoveField(
model_name='documentprocesssettings',
name='train_model',
),
migrations.RemoveField(
model_name='documentprocesssettings',
name='typology',
),
# ... and then the model itself is deleted.
migrations.DeleteModel(
name='DocumentProcessSettings',
),
]
# Generated by Django 2.1.4 on 2019-05-20 13:41
import core.models
import django.contrib.postgres.fields.jsonb
import django.core.validators
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
import uuid
class Migration(migrations.Migration):
# Schema migration: adds revision/version bookkeeping and training
# statistics columns to OcrModel, and alters its `document` and `file` fields.
# Applied on top of core migration 0024_auto_20190520_1138.
dependencies = [
('core', '0024_auto_20190520_1138'),
]
operations = [
# Non-editable UUID, defaulting to a fresh uuid4.
migrations.AddField(
model_name='ocrmodel',
name='revision',
field=models.UUIDField(default=uuid.uuid4, editable=False),
),
# Training statistics, all defaulting to zero.
migrations.AddField(
model_name='ocrmodel',
name='training_accuracy',
field=models.FloatField(default=0.0),
),
migrations.AddField(
model_name='ocrmodel',
name='training_errors',
field=models.IntegerField(default=0),
),
migrations.AddField(
model_name='ocrmodel',
name='training_total',
field=models.IntegerField(default=0),
),
# preserve_default=False: 'unknown' is only used to fill rows that
# already exist; it is not kept as the field's default afterwards.
migrations.AddField(
model_name='ocrmodel',
name='version_author',
field=models.CharField(default='unknown', editable=False, max_length=128),
preserve_default=False,
),
# Same one-off default mechanism: existing rows get timezone.now,
# new rows rely on auto_now_add.
migrations.AddField(
model_name='ocrmodel',
name='version_created_at',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
preserve_default=False,
),
migrations.AddField(
model_name='ocrmodel',
name='version_source',
field=models.CharField(default='escriptorium', editable=False, max_length=128),
),
# auto_now: refreshed on every save of the row.
migrations.AddField(
model_name='ocrmodel',
name='version_updated_at',
field=models.DateTimeField(auto_now=True),
),
# Non-editable JSON column; `list` is passed as a callable so each row
# gets its own empty list.
migrations.AddField(
model_name='ocrmodel',
name='versions',
field=django.contrib.postgres.fields.jsonb.JSONField(default=list, editable=False),
),
# `document` becomes optional (null/blank, default None), is nulled
# when the Document is deleted, and is reachable from Document through
# the `ocr_models` reverse accessor.
migrations.AlterField(
model_name='ocrmodel',
name='document',
field=models.ForeignKey(blank=True, default=None, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='ocr_models', to='core.Document'),
),
# Uploads go through core.models.models_path and only the 'mlmodel'
# file extension passes validation.
migrations.AlterField(
model_name='ocrmodel',
name='file',
field=models.FileField(upload_to=core.models.models_path, validators=[django.core.validators.FileExtensionValidator(allowed_extensions=['mlmodel'])]),
),
]
# Generated by Django 2.1.4 on 2019-05-21 09:25
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
# Schema migration: drops OcrModel.trained and adds an `owner` link to the
# project's user model.
dependencies = [
# The user model is swappable, so the dependency is declared through
# settings.AUTH_USER_MODEL instead of a hard-coded app/migration.
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
('core', '0025_auto_20190520_1341'),
]
operations = [
migrations.RemoveField(
model_name='ocrmodel',
name='trained',
),
# Nullable FK to the user model; deleting the user nulls the column
# (SET_NULL) rather than deleting the OcrModel row.
migrations.AddField(
model_name='ocrmodel',
name='owner',
field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL),
),
]
# Generated by Django 2.1.4 on 2019-05-21 11:30
from django.db import migrations, models
class Migration(migrations.Migration):
# Schema migration: adds the boolean `training` flag to OcrModel.
# Applied on top of core migration 0026_auto_20190521_0925.
dependencies = [
('core', '0026_auto_20190521_0925'),
]
operations = [
# Defaults to False for existing and new rows.
migrations.AddField(
model_name='ocrmodel',
name='training',
field=models.BooleanField(default=False),
),
]
# Generated by Django 2.1.4 on 2019-05-27 11:23
from django.db import migrations
class Migration(migrations.Migration):
# State-only migration: changes the Meta options of OcrModel.
# Applied on top of core migration 0027_ocrmodel_training.
dependencies = [
('core', '0027_ocrmodel_training'),
]
operations = [
# Default ordering becomes descending `version_updated_at`
# (the leading '-' reverses the sort).
migrations.AlterModelOptions(
name='ocrmodel',
options={'ordering': ('-version_updated_at',)},
),
]
# Generated by Django 2.1.4 on 2019-05-28 12:55
from django.db import migrations, models
class Migration(migrations.Migration):
# Schema migration: adds the `training_epoch` counter to OcrModel.
# Applied on top of core migration 0028_auto_20190527_1123.
dependencies = [
('core', '0028_auto_20190527_1123'),
]
operations = [
# Small non-negative integer, starting at 0.
migrations.AddField(
model_name='ocrmodel',
name='training_epoch',
field=models.PositiveSmallIntegerField(default=0),
),
]
import re
import math
import logging
import os.path
import os
import functools
import subprocess
from PIL import Image
......@@ -17,12 +17,14 @@ from django.core.files.storage import FileSystemStorage
from django.core.validators import FileExtensionValidator
from django.dispatch import receiver
from django.utils.functional import cached_property
from django.utils.translation import gettext as _
from django.utils.text import slugify
from django.utils.translation import gettext_lazy as _
from django.db.models.signals import pre_delete
from celery.result import AsyncResult
from celery.task.control import inspect, revoke
from celery import chain, group
from celery import chain, group, chord
from easy_thumbnails.files import get_thumbnailer, generate_all_aliases
from ordered_model.models import OrderedModel
......@@ -216,7 +218,10 @@ class Document(models.Model):
return 'rtl'
else:
return 'ltr'
@property
def is_training(self):
    """Tell whether an OCR model of this document is currently training.

    Returns:
        bool: True when at least one related ``ocr_models`` entry has
        ``training=True``; False when none does (including when the
        document has no OCR model at all).
    """
    # Only presence matters here, so ask the database whether a matching
    # row exists instead of counting all of them.
    return self.ocr_models.filter(training=True).exists()
def document_images_path(instance, filename):
return 'documents/%d/%s' % (instance.document.pk, filename)
......@@ -308,7 +313,7 @@ class DocumentPart(OrderedModel):
return 0
self.transcription_progress = min(int(transcribed / total * 100), 100)
def recalculate_ordering(self, line_level_treshold=1/100):
def recalculate_ordering(self, text_direction=None, line_level_treshold=1/100):
"""
Re-order the lines of the DocumentPart depending or text direction.
Beware 'text direction' is different from reading order,
......@@ -318,11 +323,9 @@ class DocumentPart(OrderedModel):
for which blocks should be considered on the same 'line',
in which case x is used.
"""
try:
text_direction = self.document.process_settings.text_direction[-2:]
except DocumentProcessSettings.DoesNotExist:
text_direction = 'lr'
text_direction = (text_direction
or (self.document.main_script and self.document.main_script.text_direction[-2:])
or 'lr')
def origin_pt(box):
if text_direction == 'rl':
return (box[2], box[1])
......@@ -583,7 +586,7 @@ class DocumentPart(OrderedModel):
self.workflow_state = self.WORKFLOW_STATE_SEGMENTED
self.save()
self.recalculate_ordering()
self.recalculate_ordering(text_direction=text_direction)
def transcribe(self, model=None, text_direction=None):
if model:
......@@ -592,17 +595,15 @@ class DocumentPart(OrderedModel):
document=self.document)
model_ = kraken_models.load_any(model.file.path)
lines = self.lines.all()
try:
text_direction = self.document.process_settings.text_direction
except DocumentProcessSettings.DoesNotExist:
text_direction = None
text_direction = (text_direction
or (self.document.main_script and self.document.main_script.text_direction)
or 'horizontal-lr')
with Image.open(self.bw_image.file.name) as im:
for line in lines:
it = rpred.rpred(
model_, im,
bounds={'boxes': [line.box],
'text_direction': text_direction or 'horizontal-lr',
'text_direction': text_direction,
'script_detection': False},
pad=16, # TODO: % of the image?
bidi_reordering=True)
......@@ -624,7 +625,7 @@ class DocumentPart(OrderedModel):
redis_.set('process-%d' % self.pk, json.dumps({tasks[-1].name: {"status": "pending"}}))
chain(*tasks).delay()
def task(self, task_name, **kwargs):
def task(self, task_name, commit=True, **kwargs):
if not self.tasks_finished():
raise AlreadyProcessingException
tasks = []
......@@ -643,16 +644,19 @@ class DocumentPart(OrderedModel):
or (tasks_order.index(task_name) > tasks_order.index('binarize')
and not self.binarized)):
tasks.append(binarize.si(self.pk, **kwargs))
if (task_name == 'segment'
or (tasks_order.index(task_name) > tasks_order.index('segment')
and not self.segmented)):
tasks.append(segment.si(self.pk, **kwargs))
if task_name == 'transcribe':
tasks.append(transcribe.si(self.pk, **kwargs))
self.chain_tasks(*tasks)
if commit:
self.chain_tasks(*tasks)
return tasks
class Block(OrderedModel, models.Model):
......@@ -729,6 +733,7 @@ class Transcription(models.Model):
DEFAULT_NAME = 'manual'
class Meta:
# Default ordering: descending `updated_at` (most recently updated first).
ordering = ('-updated_at',)
# A given name can be used by only one transcription per document.
unique_together = (('name', 'document'),)
def __str__(self):
......@@ -759,43 +764,81 @@ class LineTranscription(Versioned, models.Model):