Commit 4002a570 authored by Robin Tissot

wip

parent 922906f7
......@@ -79,7 +79,7 @@ class DocumentViewSet(ModelViewSet):
form.save() # create the import
try:
form.process()
except ParseError as e:
except ParseError:
return self.form_error("Incorrectly formatted file, couldn't parse it.")
return Response({'status': 'ok'})
else:
......
# Generated by Django 2.1.4 on 2020-10-09 10:15
import core.models
import django.contrib.postgres.fields.jsonb
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('core', '0041_create_external_ids'),
]
operations = [
migrations.AlterField(
model_name='block',
name='box',
field=django.contrib.postgres.fields.jsonb.JSONField(validators=[core.models.validate_polygon, core.models.validate_3_points]),
),
migrations.AlterField(
model_name='line',
name='baseline',
field=django.contrib.postgres.fields.jsonb.JSONField(blank=True, null=True, validators=[core.models.validate_polygon, core.models.validate_2_points]),
),
migrations.AlterField(
model_name='line',
name='block',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='lines', to='core.Block'),
),
migrations.AlterField(
model_name='line',
name='mask',
field=django.contrib.postgres.fields.jsonb.JSONField(blank=True, null=True, validators=[core.models.validate_polygon, core.models.validate_3_points]),
),
]
from datetime import datetime
import json
import os
import io
import requests
from zipfile import ZipFile
from django import forms
from django.core.files.base import ContentFile
from django.db.models import Prefetch
from django.http import StreamingHttpResponse, HttpResponse
from django.template import loader
from django.utils.translation import gettext as _
from django.utils.text import slugify
from bootstrap.forms import BootstrapFormMixin
from core.models import Transcription, LineTranscription
from core.models import Transcription
from imports.models import DocumentImport
from imports.parsers import make_parser, ParseError
from imports.tasks import document_import, document_export
from reporting.models import TaskReport
from users.consumers import send_event
class ImportForm(BootstrapFormMixin, forms.Form):
......@@ -98,11 +93,11 @@ class ImportForm(BootstrapFormMixin, forms.Form):
self.instance = self.current_import
else:
imp = DocumentImport(
document = self.document,
document=self.document,
name=self.cleaned_data['name'],
override=self.cleaned_data['override'],
total=self.cleaned_data['total'], # added to the dict by clean_*()
started_by = self.user)
started_by=self.user)
if self.cleaned_data.get('iiif_uri'):
content = self.cleaned_data.get('iiif_uri')
imp.import_file.save(
......@@ -111,12 +106,22 @@ class ImportForm(BootstrapFormMixin, forms.Form):
elif self.cleaned_data.get('upload_file'):
imp.import_file = self.cleaned_data.get('upload_file')
# create a report and link to it
report = TaskReport.objects.create(
label='Import',
user=self.user)
imp.report = report
imp.save()
self.instance = imp
return self.instance
def process(self):
document_import.delay(self.instance.pk)
send_event('document', self.document.pk, "import:queued", {
"id": self.document.pk
})
class ExportForm(BootstrapFormMixin, forms.Form):
......@@ -153,5 +158,8 @@ class ExportForm(BootstrapFormMixin, forms.Form):
parts = self.cleaned_data['parts']
file_format = self.cleaned_data['file_format']
transcription = self.cleaned_data['transcription']
report = TaskReport.objects.create(user=self.user, label='Export')
document_export.delay(file_format, self.user.pk, self.document.pk,
parts, transcription.pk, self.cleaned_data['include_images'])
parts, transcription.pk, report.pk,
include_images=self.cleaned_data['include_images'])
# Generated by Django 2.1.4 on 2020-10-09 10:15
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('reporting', '0001_initial'),
('imports', '0010_auto_20191015_0917'),
]
operations = [
migrations.RemoveField(
model_name='documentimport',
name='task_id',
),
migrations.AddField(
model_name='documentimport',
name='report',
field=models.ForeignKey(blank=True, max_length=64, null=True, on_delete=django.db.models.deletion.CASCADE, to='reporting.TaskReport'),
),
]
......@@ -8,6 +8,7 @@ from escriptorium.celery import app
from core.models import Document
from users.models import User
from imports.parsers import make_parser, XML_EXTENSIONS
from reporting.models import TaskReport
class DocumentImport(models.Model):
......@@ -39,7 +40,8 @@ class DocumentImport(models.Model):
validators=[FileExtensionValidator(
allowed_extensions=XML_EXTENSIONS + ['json'])])
task_id = models.CharField(max_length=64, blank=True) # celery task id
report = models.ForeignKey(TaskReport, max_length=64, null=True, blank=True,
on_delete=models.CASCADE)
processed = models.PositiveIntegerField(default=0)
total = models.PositiveIntegerField(default=None, null=True, blank=True)
......@@ -74,7 +76,8 @@ class DocumentImport(models.Model):
self.save()
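# when resuming an interrupted import, skip the elements already processed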
start_at = resume and self.processed or 0
parser = make_parser(self.document, self.import_file, name=self.name)
parser = make_parser(self.document, self.import_file,
name=self.name, report=self.report)
for obj in parser.parse(start_at=start_at,
override=self.override,
user=self.started_by):
......@@ -87,5 +90,6 @@ class DocumentImport(models.Model):
except Exception as e:
self.workflow_state = self.WORKFLOW_STATE_ERROR
self.error_message = str(e)[:512]
self.report.error(str(e))
self.save()
raise e
......@@ -27,6 +27,7 @@ from versioning.models import NoChangeException
logger = logging.getLogger(__name__)
XML_EXTENSIONS = ["xml", "alto"] # , 'abbyy'
OWN_RISK = "the validity of the data cannot be automatically checked, use at your own risk."
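# appended to warnings for documents that cannot be validated but are imported anyway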
class ParseError(Exception):
......@@ -41,9 +42,10 @@ class ParserDocument:
DEFAULT_NAME = None
def __init__(self, document, file_handler, transcription_name=None):
def __init__(self, document, file_handler, report, transcription_name=None):
self.file = file_handler
self.document = document
self.report = report
self.name = transcription_name or self.DEFAULT_NAME
@property
......@@ -96,16 +98,17 @@ class ZipParser(ParserDocument):
continue
with zfh.open(finfo) as zipedfh:
parser = make_parser(self.document, zipedfh,
name=self.name)
name=self.name, report=self.report)
try:
for part in parser.parse(override=override):
yield part
except ParseError as e:
# let it go so we can try the other documents in the archive
msg = _("Parse error in {filename}: {error}").format(
msg = _("Parse error in {filename}: {error}, skipping it.").format(
filename=self.file.name, error=e.args[0]
)
logger.warning(msg)
self.report.append(msg)
if user:
user.notify(msg, id="import:warning", level="warning")
......@@ -113,7 +116,7 @@ class ZipParser(ParserDocument):
class XMLParser(ParserDocument):
ACCEPTED_SCHEMAS = ()
def __init__(self, document, file_handler, transcription_name=None, xml_root=None):
def __init__(self, document, file_handler, report, transcription_name=None, xml_root=None):
if xml_root is not None:
self.root = xml_root
try:
......@@ -122,14 +125,18 @@ class XMLParser(ParserDocument):
namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
)[0].split(" ")[-1]
except (etree.XPathEvalError, IndexError) as e:
raise ParseError("Cannot Find Schema location %s" % e.args[0])
message = "Cannot Find Schema location %s, %s" % (e.args[0], OWN_RISK)
if report:
report.append(message)
else:
raise ParseError(message)
else:
try:
self.root = etree.parse(self.file).getroot()
except (AttributeError, etree.XMLSyntaxError) as e:
raise ParseError("Invalid XML. %s" % e.args[0])
super().__init__(document, file_handler, transcription_name=transcription_name)
super().__init__(document, file_handler, transcription_name=transcription_name,
report=report)
def validate(self):
if self.schema_location in self.ACCEPTED_SCHEMAS:
......@@ -139,7 +146,8 @@ class XMLParser(ParserDocument):
schema_root = etree.XML(content)
except requests.exceptions.RequestException as e:
logger.exception(e)
raise ParseError("Can't reach validation document %s." % self.schema_location)
self.report.append("Can't reach validation document %s, %s" % (self.schema_location, OWN_RISK))
else:
try:
xmlschema = etree.XMLSchema(schema_root)
......@@ -149,10 +157,10 @@ class XMLParser(ParserDocument):
etree.DocumentInvalid,
etree.XMLSyntaxError,
) as e:
raise ParseError("Document didn't validate. %s" % e.args[0])
self.report.append("Document didn't validate. %s, %s" % (e.args[0], OWN_RISK))
else:
raise ParseError("Document Schema not valid %s. Valid schemas are: %s" %
(self.schema_location, self.ACCEPTED_SCHEMAS))
self.report.append("Document Schema %s is not in the accepted escriptiium list. Valid schemas are: %s, %s" %
(self.schema_location, self.ACCEPTED_SCHEMAS, OWN_RISK))
def get_filename(self, pageTag):
raise NotImplementedError
......@@ -205,7 +213,7 @@ class XMLParser(ParserDocument):
)[0]
except IndexError:
# TODO: check for the image in the zip
raise ParseError(
self.report.append(
_("No match found for file {} with filename \"{}\".").format(
self.file.name, filename
)
......@@ -232,7 +240,7 @@ class XMLParser(ParserDocument):
try:
block.full_clean()
except ValidationError as e:
raise ParseError(
self.report.append(
"Block in '{filen}' line N°{line} was skipped because: {error}".format(
filen=self.file.name, line=blockTag.sourceline, error=e))
else:
......@@ -257,8 +265,8 @@ class XMLParser(ParserDocument):
try:
line.full_clean()
except ValidationError as e:
raise ParseError("Line in '{filen}' line N°{line} was skipped because: {error}".format(
filen=self.file.name, line=blockTag.sourceline, error=e))
self.report.append("Line in '{filen}' line N°{line} was skipped because: {error}".format(
filen=self.file.name, line=blockTag.sourceline, error=e))
else:
line.save()
......@@ -281,6 +289,7 @@ class AltoParser(XMLParser):
"http://www.loc.gov/standards/alto/v4/alto.xsd",
"http://www.loc.gov/standards/alto/v4/alto-4-0.xsd",
"http://www.loc.gov/standards/alto/v4/alto-4-1.xsd",
"http://www.loc.gov/standards/alto/v4/alto-4-2.xsd",
escriptorium_alto
)
......@@ -356,7 +365,9 @@ The alto file should contain a Description/sourceImageInformation/fileName tag f
coords = tuple(map(float, baseline.split(" ")))
line.baseline = tuple(zip(coords[::2], coords[1::2]))
except ValueError:
logger.warning("Invalid baseline %s" % baseline)
msg = "Invalid baseline %s in {filen} line {linen}" % (baseline, self.file.name, lineTag.sourceline)
logger.warning(msg)
self.report.append(msg)
polygon = lineTag.find("Shape/Polygon", self.root.nsmap)
if polygon is not None:
......@@ -364,7 +375,9 @@ The alto file should contain a Description/sourceImageInformation/fileName tag f
coords = tuple(map(float, polygon.get("POINTS").split(" ")))
line.mask = tuple(zip(coords[::2], coords[1::2]))
except ValueError:
logger.warning("Invalid polygon %s" % polygon)
msg = "Invalid polygon %s in {filen} line {linen}" % (polygon, self.file.name, lineTag.sourceline)
logger.warning(msg)
self.report.append(msg)
else:
line.box = [
int(lineTag.get("HPOS")),
......@@ -592,7 +605,7 @@ class TranskribusPageXmlParser(PagexmlParser):
]
def make_parser(document, file_handler, name=None):
def make_parser(document, file_handler, name=None, report=None):
# TODO: not great to rely on file name extension
ext = os.path.splitext(file_handler.name)[1][1:]
if ext in XML_EXTENSIONS:
......@@ -610,16 +623,16 @@ def make_parser(document, file_handler, name=None):
# return AbbyyParser(root, name=name)
if "alto" in schema:
return AltoParser(
document, file_handler, transcription_name=name, xml_root=root
document, file_handler, report, transcription_name=name, xml_root=root
)
elif "PAGE" in schema:
if b"Transkribus" in etree.tostring(root):
return TranskribusPageXmlParser(
document, file_handler, transcription_name=name, xml_root=root
document, file_handler, report, transcription_name=name, xml_root=root
)
else:
return PagexmlParser(
document, file_handler, transcription_name=name, xml_root=root
document, file_handler, report, transcription_name=name, xml_root=root
)
else:
......@@ -627,9 +640,9 @@ def make_parser(document, file_handler, name=None):
"Couldn't determine xml schema, check the content of the root tag."
)
elif ext == "json":
return IIIFManifestParser(document, file_handler)
return IIIFManifestParser(document, file_handler, report)
elif ext == "zip":
return ZipParser(document, file_handler, transcription_name=name)
return ZipParser(document, file_handler, report, transcription_name=name)
else:
raise ValueError(
"Invalid extension for the file to be parsed %s." % file_handler.name
......
......@@ -5,6 +5,7 @@ from zipfile import ZipFile
from django.apps import apps
from django.conf import settings
from django.urls import reverse
from django.db.models import Q, Prefetch
from django.template import loader
from django.utils.text import slugify
......@@ -29,7 +30,7 @@ def document_import(task, import_pk, resume=True, task_id=None):
pk=import_pk)
try:
imp.task_id = task.request.id
imp.report.start(task.request.id)
send_event('document', imp.document.pk, "import:start", {
"id": imp.document.pk
......@@ -47,14 +48,17 @@ def document_import(task, import_pk, resume=True, task_id=None):
"reason": str(e)
})
logger.exception(e)
imp.report.error(str(e))
else:
send_event('document', imp.document.pk, "import:done", {
"id": imp.document.pk
})
imp.report.end()
@shared_task(bind=True)
def document_export(task, file_format, user_pk, document_pk, part_pks, transcription_pk, include_images=False):
def document_export(task, file_format, user_pk, document_pk, part_pks,
transcription_pk, report_pk, include_images=False):
ALTO_FORMAT = "alto"
PAGEXML_FORMAT = "pagexml"
TEXT_FORMAT = "text"
......@@ -64,10 +68,13 @@ def document_export(task, file_format, user_pk, document_pk, part_pks, transcrip
DocumentPart = apps.get_model('core', 'DocumentPart')
Transcription = apps.get_model('core', 'Transcription')
LineTranscription = apps.get_model('core', 'LineTranscription')
TaskReport = apps.get_model('reporting', 'TaskReport')
user = User.objects.get(pk=user_pk)
document = Document.objects.get(pk=document_pk)
report = TaskReport.objects.get(pk=report_pk)
report.start(task.request.id)
send_event('document', document.pk, "export:start", {
"id": document.pk
})
......@@ -103,27 +110,39 @@ def document_export(task, file_format, user_pk, document_pk, part_pks, transcrip
parts = DocumentPart.objects.filter(document=document, pk__in=part_pks)
with ZipFile(filepath, 'w') as zip_:
for part in parts:
page = tplt.render({
'valid_block_types': document.valid_block_types.all(),
'valid_line_types': document.valid_line_types.all(),
'part': part,
'blocks': (part.blocks.order_by('order')
.prefetch_related(
Prefetch(
'lines',
queryset=Line.objects.prefetch_transcription(transcription)))),
'orphan_lines': (part.lines.prefetch_transcription(transcription)
.filter(block=None))
})
zip_.writestr('%s.xml' % os.path.splitext(part.filename)[0], page)
try:
page = tplt.render({
'valid_block_types': document.valid_block_types.all(),
'valid_line_types': document.valid_line_types.all(),
'part': part,
'blocks': (part.blocks.order_by('order')
.prefetch_related(
Prefetch(
'lines',
queryset=Line.objects.prefetch_transcription(
transcription)))),
'orphan_lines': (part.lines.prefetch_transcription(transcription)
.filter(block=None))
})
except Exception as e:
report.append("Skipped {element}({image}) because '{reason}'.".format(
element=part.name, image=part.filename, reason=str(e)
))
else:
zip_.writestr('%s.xml' % os.path.splitext(part.filename)[0], page)
if include_images:
zip_.write(part.image.path, part.filename)
report.end()
zip_.close()
# send websocket msg
rel_path = os.path.relpath(filepath, settings.MEDIA_ROOT)
user.notify('Export ready!', level='success', link={'text': 'Download', 'src': rel_path})
report_uri = reverse('report-detail', kwargs={'pk': report.pk})
user.notify('Export ready!', level='success', links=[
{'text': 'Download', 'src': settings.MEDIA_URL + rel_path},
{'text': 'Report', 'src': report_uri},
])
send_event('document', document.pk, "export:done", {
"id": document.pk
......
from django.contrib import admin
from reporting.models import TaskReport
# class TaskReportAdmin():
# pass
admin.site.register(TaskReport)
from django.apps import AppConfig
class ReportingConfig(AppConfig):
name = 'reporting'
# Generated by Django 2.1.4 on 2020-10-09 10:15
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
initial = True
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name='TaskReport',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('workflow_state', models.PositiveSmallIntegerField(choices=[(0, 'Queued'), (1, 'Running'), (2, 'Crashed'), (3, 'Finished')], default=0)),
('label', models.CharField(max_length=256)),
('messages', models.TextField(blank=True)),
('queued_at', models.DateTimeField(auto_now_add=True)),
('started_at', models.DateTimeField(null=True)),
('done_at', models.DateTimeField(null=True)),
('task_id', models.CharField(blank=True, max_length=64, null=True)),
('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
),
]
from datetime import datetime, timezone
from django.db import models
from django.utils.translation import gettext_lazy as _
from django.contrib.auth import get_user_model
User = get_user_model()
class TaskReport(models.Model):
WORKFLOW_STATE_QUEUED = 0
WORKFLOW_STATE_STARTED = 1
WORKFLOW_STATE_ERROR = 2
WORKFLOW_STATE_DONE = 3
WORKFLOW_STATE_CHOICES = (
(WORKFLOW_STATE_QUEUED, _("Queued")),
(WORKFLOW_STATE_STARTED, _("Running")),
(WORKFLOW_STATE_ERROR, _("Crashed")),
(WORKFLOW_STATE_DONE, _("Finished"))
)
workflow_state = models.PositiveSmallIntegerField(
default=WORKFLOW_STATE_QUEUED,
choices=WORKFLOW_STATE_CHOICES
)
label = models.CharField(max_length=256)
messages = models.TextField(blank=True)
queued_at = models.DateTimeField(auto_now_add=True)
started_at = models.DateTimeField(null=True)
done_at = models.DateTimeField(null=True)
user = models.ForeignKey(User, on_delete=models.CASCADE)
# celery task id
task_id = models.CharField(max_length=64, blank=True, null=True)
def append(self, text):
self.messages += text + '\n'
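# append() only buffers the message; start(), error() and end() below persist it with save()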
def start(self, task_id):
self.task_id = task_id
self.workflow_state = self.WORKFLOW_STATE_STARTED
self.started_at = datetime.now(timezone.utc)
self.save()
def error(self, message):
# unrecoverable error
self.workflow_state = self.WORKFLOW_STATE_ERROR
self.append(message)
self.save()
def end(self):
self.workflow_state = self.WORKFLOW_STATE_DONE
self.done_at = datetime.now(timezone.utc)
self.append('Done.')
self.save()
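# Illustrative sketch, not part of this commit: how a caller is expected to drive a
# TaskReport through its lifecycle with the methods above. The helper name and its
# arguments are hypothetical.
def _example_report_lifecycle(user, celery_task_id, work):
    # create a report, run an arbitrary callable against it, then close it
    report = TaskReport.objects.create(user=user, label='Example')
    report.start(celery_task_id)  # Running: records the celery task id and started_at
    try:
        work(report)  # the task can report.append() non-fatal warnings as it goes
    except Exception as e:
        report.error(str(e))  # Crashed: message appended and saved
    else:
        report.end()  # Finished: done_at stamped and saved
    return report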
from django.test import TestCase
# Create your tests here.
from django.urls import path
from reporting.views import (ReportList, ReportDetail)
urlpatterns = [
path('reports/', ReportList.as_view(), name='report-list'),
path('reports/<int:pk>/', ReportDetail.as_view(), name='report-detail'),
]