From ecd1915bdc5cb19138bf4fa8b15285412b1d0212 Mon Sep 17 00:00:00 2001 From: Robin Tissot <tissotrobin@gmail.com> Date: Tue, 10 Nov 2020 14:03:13 +0100 Subject: [PATCH] Pass imports tests. --- app/apps/imports/forms.py | 6 -- app/apps/imports/mocks/test_single.alto | 130 +++++++++++++----------- app/apps/imports/models.py | 15 ++- app/apps/imports/parsers.py | 10 +- app/apps/imports/tests.py | 63 +++++++----- 5 files changed, 129 insertions(+), 95 deletions(-) diff --git a/app/apps/imports/forms.py b/app/apps/imports/forms.py index aab6b8eb..2989588a 100644 --- a/app/apps/imports/forms.py +++ b/app/apps/imports/forms.py @@ -106,12 +106,6 @@ class ImportForm(BootstrapFormMixin, forms.Form): elif self.cleaned_data.get('upload_file'): imp.import_file = self.cleaned_data.get('upload_file') - # create a report and link to it - report = TaskReport.objects.create( - label=_('Import in %(document_name)s') % {'document_name': self.document.name}, - user=self.user) - imp.report = report - imp.save() self.instance = imp diff --git a/app/apps/imports/mocks/test_single.alto b/app/apps/imports/mocks/test_single.alto index 8cd703e8..6a8f78b7 100644 --- a/app/apps/imports/mocks/test_single.alto +++ b/app/apps/imports/mocks/test_single.alto @@ -1,78 +1,90 @@ <?xml version="1.0" encoding="UTF-8"?> <alto xmlns="http://www.loc.gov/standards/alto/ns-v4#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# http://www.loc.gov/standards/alto/v4/alto-4-0.xsd"> + xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# http://www.loc.gov/standards/alto/v4/alto-4-0.xsd"> <Description> - <MeasurementUnit>pixel</MeasurementUnit> - <sourceImageInformation> - <fileName>test1.png</fileName> - </sourceImageInformation> - <OCRProcessing ID="OCR_0"> - <ocrProcessingStep> - <processingSoftware> - <softwareName>kraken</softwareName> - </processingSoftware> - </ocrProcessingStep> - </OCRProcessing> + <MeasurementUnit>pixel</MeasurementUnit> + <sourceImageInformation> + <fileName>test1.png</fileName> + </sourceImageInformation> + <OCRProcessing ID="OCR_0"> + <ocrProcessingStep> + <processingSoftware> + <softwareName>kraken</softwareName> + </processingSoftware> + </ocrProcessingStep> + </OCRProcessing> </Description> <Layout> <Page WIDTH="850" HEIGHT="1083" PHYSICAL_IMG_NR="0" ID="page_0"> - <PrintSpace HPOS="0" VPOS="0" WIDTH="850" HEIGHT="1083"> - <TextBlock HPOS="0" VPOS="0" ID="textblock_0" WIDTH="850" HEIGHT="1083"> - <TextLine ID="line_0" - HPOS="160" - VPOS="771" - WIDTH="60" - HEIGHT="28"> - <String ID="segment_0" - CONTENT="This" - HPOS="160" - VPOS="771" - WIDTH="20" - HEIGHT="28"></String> + <PrintSpace HPOS="0" VPOS="0" WIDTH="850" HEIGHT="1083"> + <TextBlock HPOS="0" VPOS="0" ID="textblock_0" WIDTH="850" HEIGHT="1083"> + <TextLine ID="line_0" + HPOS="160" + VPOS="771" + WIDTH="60" + HEIGHT="28"> + <String ID="segment_0" + CONTENT="This" + HPOS="160" + VPOS="771" + WIDTH="20" + HEIGHT="28"></String> <String ID="segment_1" - CONTENT="is" - HPOS="185" - VPOS="771" - WIDTH="10" - HEIGHT="28"></String> + CONTENT="is" + HPOS="185" + VPOS="771" + WIDTH="10" + HEIGHT="28"></String> <String ID="segment_2" - CONTENT="a" - HPOS="195" - VPOS="771" - WIDTH="5" - HEIGHT="28"></String> + CONTENT="a" + HPOS="195" + VPOS="771" + WIDTH="5" + HEIGHT="28"></String> <String ID="segment_3" - CONTENT="test" - HPOS="200" - VPOS="771" - WIDTH="20" - HEIGHT="28"></String> - </TextLine> + CONTENT="test" + HPOS="200" + VPOS="771" + WIDTH="20" + HEIGHT="28"></String> + </TextLine> <TextLine ID="line_1" - HPOS="160" - VPOS="800" - WIDTH="25" - HEIGHT="28"> + HPOS="160" + VPOS="800" + WIDTH="25" + HEIGHT="28"> <String ID="segment_1_0" - CONTENT="Line 2" - HPOS="160" - VPOS="771" - WIDTH="25" - HEIGHT="28"></String> + CONTENT="Line" + HPOS="160" + VPOS="771" + WIDTH="25" + HEIGHT="28"></String> + <String ID="segment_1_1" + CONTENT="2" + HPOS="185" + VPOS="771" + WIDTH="25" + HEIGHT="28"></String> </TextLine> <TextLine ID="line_2" - HPOS="160" - VPOS="830" - WIDTH="25" - HEIGHT="28"> + HPOS="160" + VPOS="830" + WIDTH="25" + HEIGHT="28"> <String ID="segment_2_0" - CONTENT="Line 3" - HPOS="160" - VPOS="771" - WIDTH="25" - HEIGHT="28"></String> + CONTENT="3" + HPOS="160" + VPOS="771" + WIDTH="25" + HEIGHT="28"></String> + <String ID="segment_2_1" + CONTENT="3" + HPOS="185" + VPOS="771" + WIDTH="25" + HEIGHT="28"></String> </TextLine> </TextBlock> </PrintSpace> diff --git a/app/apps/imports/models.py b/app/apps/imports/models.py index bc640695..0b7bd257 100644 --- a/app/apps/imports/models.py +++ b/app/apps/imports/models.py @@ -2,6 +2,7 @@ import os.path from django.core.validators import FileExtensionValidator from django.db import models +from django.utils.translation import gettext as _ from escriptorium.celery import app @@ -68,8 +69,18 @@ class DocumentImport(models.Model): self.workflow_state = self.WORKFLOW_STATE_ERROR self.error_message = 'canceled' self.save() - if self.task_id: - app.control.revoke(self.task_id, terminate=True) + if self.report and self.report.task_id: + app.control.revoke(self.report.task_id, terminate=True) + + def save(self, *args, **kwargs): + if not self.report: + # create a report and link to it + report = TaskReport.objects.create( + label=_('Import in %(document_name)s') % {'document_name': self.document.name}, + user=self.started_by or self.document.owner) + self.report = report + + super().save(*args, **kwargs) def process(self, resume=True): try: diff --git a/app/apps/imports/parsers.py b/app/apps/imports/parsers.py index 32830672..269bfab8 100644 --- a/app/apps/imports/parsers.py +++ b/app/apps/imports/parsers.py @@ -116,7 +116,7 @@ class ZipParser(ParserDocument): part.save() # xml - elif file_extension == 'xml': + elif file_extension in XML_EXTENSIONS: parser = make_parser(self.document, zipedfh, name=self.name, report=self.report) @@ -386,6 +386,7 @@ The alto file should contain a Description/sourceImageInformation/fileName tag f def update_line(self, line, lineTag): baseline = lineTag.get("BASELINE") + if baseline is not None: # sometimes baseline is just a single number, # an offset maybe it's not super clear @@ -401,6 +402,13 @@ The alto file should contain a Description/sourceImageInformation/fileName tag f logger.warning(msg) if self.report: self.report.append(msg) + else: + # extract it from <String>s then + strings = lineTag.findall("String", self.root.nsmap) + last_segment = strings[-1] + line.baseline = [(int(e.get('HPOS')), int(e.get('VPOS'))) for e in strings] + line.baseline.append((int(last_segment.get('HPOS'))+int(last_segment.get('WIDTH')), + int(last_segment.get('VPOS')))) polygon = lineTag.find("Shape/Polygon", self.root.nsmap) if polygon is not None: diff --git a/app/apps/imports/tests.py b/app/apps/imports/tests.py index 3abf724a..9a934275 100644 --- a/app/apps/imports/tests.py +++ b/app/apps/imports/tests.py @@ -11,6 +11,7 @@ from imports.models import DocumentImport from imports.parsers import AltoParser, IIIFManifestParser from core.models import Block, Line, Transcription, LineTranscription, BlockType, LineType from core.tests.factory import CoreFactoryTestCase +from reporting.models import TaskReport class XmlImportTestCase(CoreFactoryTestCase): @@ -36,7 +37,7 @@ class XmlImportTestCase(CoreFactoryTestCase): filename = 'test_single.alto' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(13): + with self.assertNumQueries(20): response = self.client.post(uri, { 'upload_file': SimpleUploadedFile(filename, fh.read()) }) @@ -51,17 +52,14 @@ class XmlImportTestCase(CoreFactoryTestCase): # import was created since the form validated imp = DocumentImport.objects.first() - self.assertTrue(imp.error_message, 'No match found') - - self.part1.original_filename = 'test1.png' - self.part1.save() + self.assertTrue(imp.report.messages.startswith('No match found for file import_src/test_single_')) def test_alto_invalid_xml(self): uri = reverse('api:document-imports', kwargs={'pk': self.document.pk}) filename = 'test_invalid_alto.xml' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(6): + with self.assertNumQueries(8): response = self.client.post(uri, { 'upload_file': SimpleUploadedFile(filename, fh.read()) }) @@ -72,7 +70,7 @@ class XmlImportTestCase(CoreFactoryTestCase): filename = 'test_single.alto' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(47): + with self.assertNumQueries(54): response = self.client.post(uri, { 'upload_file': SimpleUploadedFile(filename, fh.read()) }) @@ -86,7 +84,7 @@ class XmlImportTestCase(CoreFactoryTestCase): self.assertEqual(self.part1.blocks.count(), 1) self.assertEqual(self.part1.blocks.first().box, [[0, 0], [850, 0], [850, 1083], [0, 1083]]) self.assertEqual(self.part1.lines.count(), 3) - self.assertEqual(self.part1.lines.first().box, [160, 771, 220, 799]) + self.assertEqual(self.part1.lines.first().box, [160, 771, 220, 771]) self.assertEqual(self.part1.lines.first().transcriptions.first().content, 'This is a test') self.assertEqual(self.part2.blocks.count(), 0) self.assertEqual(self.part2.lines.count(), 0) @@ -96,7 +94,7 @@ class XmlImportTestCase(CoreFactoryTestCase): filename = 'test_single_baselines.alto' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(33): + with self.assertNumQueries(40): response = self.client.post(uri, { 'upload_file': SimpleUploadedFile(filename, fh.read()) }) @@ -120,7 +118,7 @@ class XmlImportTestCase(CoreFactoryTestCase): filename = 'test.zip' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(65): + with self.assertNumQueries(72): response = self.client.post(uri, { 'upload_file': SimpleUploadedFile(filename, fh.read()) }) @@ -130,6 +128,7 @@ class XmlImportTestCase(CoreFactoryTestCase): self.assertEqual(DocumentImport.objects.count(), 1) self.assertEqual(DocumentImport.objects.first().workflow_state, DocumentImport.WORKFLOW_STATE_DONE) + self.assertEqual(self.part1.blocks.count(), 1) self.assertEqual(self.part1.lines.count(), 3) self.assertEqual(self.part2.blocks.count(), 1) @@ -144,7 +143,7 @@ class XmlImportTestCase(CoreFactoryTestCase): filename = 'test_composedblock.alto' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(59): + with self.assertNumQueries(66): response = self.client.post(uri, { 'upload_file': SimpleUploadedFile(filename, fh.read()) }) @@ -170,6 +169,7 @@ class XmlImportTestCase(CoreFactoryTestCase): with open(mock_path, 'rb') as fh: imp = DocumentImport.objects.create( document=self.document, + started_by=self.document.owner, import_file=ContentFile( fh.read(), name=os.path.join( @@ -189,7 +189,8 @@ class XmlImportTestCase(CoreFactoryTestCase): b = Block.objects.create(document_part=self.part1, external_id="textblock_0", box=[[0, 0], [100, 100]]) l = Line.objects.create(document_part=self.part1, block=b, external_id="line_0", - box=[10, 10, 50, 20]) + baseline=((5, 5), (5, 10)), + mask=((0, 0), (0, 10), (10, 0), (10,10))) lt = LineTranscription.objects.create(transcription=trans, line=l) uri = reverse('api:document-imports', kwargs={'pk': self.document.pk}) @@ -221,12 +222,13 @@ class XmlImportTestCase(CoreFactoryTestCase): b = Block.objects.create(document_part=self.part1, external_id="textblock_0", box=[[0, 0], [100, 100]]) l = Line.objects.create(document_part=self.part1, block=b, external_id="line_0", - box=[10, 10, 50, 20]) + baseline=((5, 5), (5, 10)), + mask=((0, 0), (0, 10), (10, 0), (10,10))) lt = LineTranscription.objects.create(transcription=trans, line=l, content="test history") # historic line without external_id b2 = Block.objects.create(document_part=self.part1, box=[[0, 0], [100, 100]]) - l2 = Line.objects.create(document_part=self.part1, block=b2, box=[10, 10, 50, 20]) + l2 = Line.objects.create(document_part=self.part1, block=b2, baseline=((10, 10), (50, 20))) lt2 = LineTranscription.objects.create(transcription=trans, line=l2, content="test dummy") uri = reverse('api:document-imports', kwargs={'pk': self.document.pk}) @@ -263,7 +265,7 @@ class XmlImportTestCase(CoreFactoryTestCase): filename = 'pagexml_test.xml' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(13): + with self.assertNumQueries(20): response = self.client.post(uri, {'upload_file': SimpleUploadedFile(filename, fh.read())}) # Note: the ParseError is raised by the processing of the import, @@ -276,7 +278,7 @@ class XmlImportTestCase(CoreFactoryTestCase): self.assertEqual(self.part3.lines.count(), 0) # import was created since the form validated imp = DocumentImport.objects.first() - self.assertTrue(imp.error_message, 'No match found') + self.assertTrue(imp.report.messages.startswith('No match found')) self.part3.original_filename = 'test3.png' self.part3.save() @@ -285,7 +287,8 @@ class XmlImportTestCase(CoreFactoryTestCase): block = Block.objects.create(document_part=self.part3, external_id="r2", box=[[0, 0], [100, 100]]) l = Line.objects.create(document_part=self.part3, block=block, external_id="r2l1", - box=[10, 10, 50, 20]) + baseline=((5, 5), (5, 10)), + mask=((0, 0), (0, 10), (10, 0), (10,10))) lt = LineTranscription.objects.create(transcription=trans, line=l) uri = reverse('api:document-imports', kwargs={'pk': self.document.pk}) @@ -307,8 +310,11 @@ class XmlImportTestCase(CoreFactoryTestCase): trans = Transcription.objects.create(name="test import", document=self.document) block = Block.objects.create(document_part=self.part3, external_id="r2", box=[[0, 0], [100, 100]]) - line = Line.objects.create(document_part=self.part3, block=block, external_id="r2l1", - box=[10, 10, 50, 20]) + line = Line.objects.create(document_part=self.part3, + block=block, + external_id="r2l1", + baseline=((5, 5), (5, 10)), + mask=((0, 0), (0, 10), (10, 0), (10,10))) lt = LineTranscription.objects.create(transcription=trans, line=line) uri = reverse('api:document-imports', kwargs={'pk': self.document.pk}) @@ -333,7 +339,7 @@ class XmlImportTestCase(CoreFactoryTestCase): filename = 'test_pagexml.zip' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(406): # theres a lot of lines in there + with self.assertNumQueries(413): # theres a lot of lines in there response = self.client.post(uri, { 'upload_file': SimpleUploadedFile(filename, fh.read()) }) @@ -362,7 +368,7 @@ class XmlImportTestCase(CoreFactoryTestCase): filename = 'test_pagexml_types.xml' mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename) with open(mock_path, 'rb') as fh: - with self.assertNumQueries(67): + with self.assertNumQueries(74): response = self.client.post(uri, { 'upload_file': SimpleUploadedFile(filename, fh.read()) }) @@ -398,7 +404,7 @@ class XmlImportTestCase(CoreFactoryTestCase): # we don't go through the form but we want to test json validation fh.seek(0) - IIIFManifestParser(self.document, fh).validate() + IIIFManifestParser(self.document, fh, imp.report).validate() imp.save() @@ -424,6 +430,7 @@ class XmlImportTestCase(CoreFactoryTestCase): document=self.document, import_file=ContentFile('', name='doesntmatter.xml'), workflow_state=DocumentImport.WORKFLOW_STATE_STARTED, + started_by=self.document.owner, processed=0) uri = reverse('api:document-cancel-import', kwargs={'pk': self.document.pk}) response = self.client.post(uri) @@ -447,7 +454,8 @@ class DocumentExportTestCase(CoreFactoryTestCase): self.parts.append(part) for j in range(1, 4): l = Line.objects.create(document_part=part, - box=(0, 0, 1, 1)) + baseline=((5, 5), (5, 10)), + mask=((0, 0), (0, 10), (10, 0), (10,10))) LineTranscription.objects.create( line=l, transcription=self.trans, @@ -455,7 +463,7 @@ class DocumentExportTestCase(CoreFactoryTestCase): def test_simple(self): self.client.force_login(self.user) - with self.assertNumQueries(10): + with self.assertNumQueries(17): response = self.client.post(reverse('api:document-export', kwargs={'pk': self.trans.document.pk}), {'transcription': self.trans.pk, @@ -468,7 +476,7 @@ class DocumentExportTestCase(CoreFactoryTestCase): def test_alto(self): self.client.force_login(self.user) - with self.assertNumQueries(19): + with self.assertNumQueries(28): response = self.client.post(reverse('api:document-export', kwargs={'pk': self.trans.document.pk}), {'transcription': self.trans.pk, @@ -485,13 +493,14 @@ class DocumentExportTestCase(CoreFactoryTestCase): for j in range(1, 4): l = Line.objects.create(document_part=part, block=block, - box=(0, 0, 1, 1)) + baseline=((5, 5), (5, 10)), + mask=((0, 0), (0, 10), (10, 0), (10,10))) LineTranscription.objects.create( line=l, transcription=self.trans, content='line %d:%d' % (i, j)) self.client.force_login(self.user) - with self.assertNumQueries(10): + with self.assertNumQueries(17): response = self.client.post(reverse('api:document-export', kwargs={'pk': self.trans.document.pk}), {'transcription': self.trans.pk, -- GitLab