From ecd1915bdc5cb19138bf4fa8b15285412b1d0212 Mon Sep 17 00:00:00 2001
From: Robin Tissot <tissotrobin@gmail.com>
Date: Tue, 10 Nov 2020 14:03:13 +0100
Subject: [PATCH] Pass imports tests.

---
 app/apps/imports/forms.py               |   6 --
 app/apps/imports/mocks/test_single.alto | 130 +++++++++++++-----------
 app/apps/imports/models.py              |  15 ++-
 app/apps/imports/parsers.py             |  10 +-
 app/apps/imports/tests.py               |  63 +++++++-----
 5 files changed, 129 insertions(+), 95 deletions(-)

diff --git a/app/apps/imports/forms.py b/app/apps/imports/forms.py
index aab6b8eb..2989588a 100644
--- a/app/apps/imports/forms.py
+++ b/app/apps/imports/forms.py
@@ -106,12 +106,6 @@ class ImportForm(BootstrapFormMixin, forms.Form):
             elif self.cleaned_data.get('upload_file'):
                 imp.import_file = self.cleaned_data.get('upload_file')
 
-            # create a report and link to it
-            report = TaskReport.objects.create(
-                label=_('Import in %(document_name)s') % {'document_name': self.document.name},
-                user=self.user)
-            imp.report = report
-
             imp.save()
             self.instance = imp
 
diff --git a/app/apps/imports/mocks/test_single.alto b/app/apps/imports/mocks/test_single.alto
index 8cd703e8..6a8f78b7 100644
--- a/app/apps/imports/mocks/test_single.alto
+++ b/app/apps/imports/mocks/test_single.alto
@@ -1,78 +1,90 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <alto xmlns="http://www.loc.gov/standards/alto/ns-v4#"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-	  xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# http://www.loc.gov/standards/alto/v4/alto-4-0.xsd">
+      xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# http://www.loc.gov/standards/alto/v4/alto-4-0.xsd">
   <Description>
-	<MeasurementUnit>pixel</MeasurementUnit>
-	<sourceImageInformation>
-	  <fileName>test1.png</fileName>
-	</sourceImageInformation>
-	<OCRProcessing ID="OCR_0">
-	  <ocrProcessingStep>
-		<processingSoftware>
-		  <softwareName>kraken</softwareName>
-		</processingSoftware>
-	  </ocrProcessingStep>
-	</OCRProcessing>
+    <MeasurementUnit>pixel</MeasurementUnit>
+    <sourceImageInformation>
+      <fileName>test1.png</fileName>
+    </sourceImageInformation>
+    <OCRProcessing ID="OCR_0">
+      <ocrProcessingStep>
+	<processingSoftware>
+	  <softwareName>kraken</softwareName>
+	</processingSoftware>
+      </ocrProcessingStep>
+    </OCRProcessing>
   </Description>
 
   <Layout>
     <Page WIDTH="850" HEIGHT="1083" PHYSICAL_IMG_NR="0" ID="page_0">
-	  <PrintSpace HPOS="0" VPOS="0" WIDTH="850" HEIGHT="1083">
-		<TextBlock HPOS="0" VPOS="0" ID="textblock_0" WIDTH="850" HEIGHT="1083">
-		  <TextLine ID="line_0"
-					HPOS="160"
-					VPOS="771" 
-					WIDTH="60" 
-					HEIGHT="28">
-			<String ID="segment_0"
-					CONTENT="This" 
-					HPOS="160" 
-					VPOS="771"
-					WIDTH="20" 
-					HEIGHT="28"></String>
+      <PrintSpace HPOS="0" VPOS="0" WIDTH="850" HEIGHT="1083">
+	<TextBlock HPOS="0" VPOS="0" ID="textblock_0" WIDTH="850" HEIGHT="1083">
+	  <TextLine ID="line_0"
+		    HPOS="160"
+		    VPOS="771"
+		    WIDTH="60"
+		    HEIGHT="28">
+	    <String ID="segment_0"
+		    CONTENT="This"
+		    HPOS="160"
+		    VPOS="771"
+		    WIDTH="20"
+		    HEIGHT="28"></String>
             <String ID="segment_1"
-					CONTENT="is" 
-					HPOS="185" 
-					VPOS="771"
-					WIDTH="10" 
-					HEIGHT="28"></String>
+		    CONTENT="is"
+		    HPOS="185"
+		    VPOS="771"
+		    WIDTH="10"
+		    HEIGHT="28"></String>
             <String ID="segment_2"
-					CONTENT="a" 
-					HPOS="195" 
-					VPOS="771"
-					WIDTH="5" 
-					HEIGHT="28"></String>
+		    CONTENT="a"
+		    HPOS="195"
+		    VPOS="771"
+		    WIDTH="5"
+		    HEIGHT="28"></String>
             <String ID="segment_3"
-					CONTENT="test" 
-					HPOS="200" 
-					VPOS="771"
-					WIDTH="20" 
-					HEIGHT="28"></String>
-		  </TextLine>
+		    CONTENT="test"
+		    HPOS="200"
+		    VPOS="771"
+		    WIDTH="20"
+		    HEIGHT="28"></String>
+	  </TextLine>
           <TextLine ID="line_1"
-					HPOS="160"
-					VPOS="800" 
-					WIDTH="25" 
-					HEIGHT="28">
+		    HPOS="160"
+		    VPOS="800"
+		    WIDTH="25"
+		    HEIGHT="28">
             <String ID="segment_1_0"
-					CONTENT="Line 2" 
-					HPOS="160" 
-					VPOS="771"
-					WIDTH="25" 
-					HEIGHT="28"></String>
+		    CONTENT="Line"
+		    HPOS="160"
+		    VPOS="771"
+		    WIDTH="25"
+		    HEIGHT="28"></String>
+            <String ID="segment_1_1"
+		    CONTENT="2"
+		    HPOS="185"
+		    VPOS="771"
+		    WIDTH="25"
+		    HEIGHT="28"></String>
           </TextLine>
           <TextLine ID="line_2"
-					HPOS="160"
-					VPOS="830" 
-					WIDTH="25" 
-					HEIGHT="28">
+		    HPOS="160"
+		    VPOS="830"
+		    WIDTH="25"
+		    HEIGHT="28">
             <String ID="segment_2_0"
-					CONTENT="Line 3" 
-					HPOS="160" 
-					VPOS="771"
-					WIDTH="25" 
-					HEIGHT="28"></String>
+		    CONTENT="Line"
+		    HPOS="160"
+		    VPOS="771"
+		    WIDTH="25"
+		    HEIGHT="28"></String>
+            <String ID="segment_2_1"
+		    CONTENT="3"
+		    HPOS="185"
+		    VPOS="771"
+		    WIDTH="25"
+		    HEIGHT="28"></String>
           </TextLine>
         </TextBlock>
       </PrintSpace>
diff --git a/app/apps/imports/models.py b/app/apps/imports/models.py
index bc640695..0b7bd257 100644
--- a/app/apps/imports/models.py
+++ b/app/apps/imports/models.py
@@ -2,6 +2,7 @@ import os.path
 
 from django.core.validators import FileExtensionValidator
 from django.db import models
+from django.utils.translation import gettext as _
 
 from escriptorium.celery import app
 
@@ -68,8 +69,18 @@ class DocumentImport(models.Model):
         self.workflow_state = self.WORKFLOW_STATE_ERROR
         self.error_message = 'canceled'
         self.save()
-        if self.task_id:
-            app.control.revoke(self.task_id, terminate=True)
+        if self.report and self.report.task_id:
+            app.control.revoke(self.report.task_id, terminate=True)
+
+    def save(self, *args, **kwargs):
+        if not self.report:
+            # create a report and link to it
+            report = TaskReport.objects.create(
+                label=_('Import in %(document_name)s') % {'document_name': self.document.name},
+                user=self.started_by or self.document.owner)
+            self.report = report
+
+        super().save(*args, **kwargs)
 
     def process(self, resume=True):
         try:
diff --git a/app/apps/imports/parsers.py b/app/apps/imports/parsers.py
index 32830672..269bfab8 100644
--- a/app/apps/imports/parsers.py
+++ b/app/apps/imports/parsers.py
@@ -116,7 +116,7 @@ class ZipParser(ParserDocument):
                             part.save()
 
                         # xml
-                        elif file_extension == 'xml':
+                        elif file_extension in XML_EXTENSIONS:
                             parser = make_parser(self.document, zipedfh,
                                                  name=self.name, report=self.report)
 
@@ -386,6 +386,7 @@ The alto file should contain a Description/sourceImageInformation/fileName tag f
 
     def update_line(self, line, lineTag):
         baseline = lineTag.get("BASELINE")
+
         if baseline is not None:
             # sometimes baseline is just a single number,
             # an offset maybe it's not super clear
@@ -401,6 +402,13 @@ The alto file should contain a Description/sourceImageInformation/fileName tag f
                     logger.warning(msg)
                     if self.report:
                         self.report.append(msg)
+        else:
+            # otherwise, derive the baseline from the <String> elements' positions
+            strings = lineTag.findall("String", self.root.nsmap)
+            last_segment = strings[-1]
+            line.baseline = [(int(e.get('HPOS')), int(e.get('VPOS'))) for e in strings]
+            line.baseline.append((int(last_segment.get('HPOS'))+int(last_segment.get('WIDTH')),
+                                  int(last_segment.get('VPOS'))))
 
         polygon = lineTag.find("Shape/Polygon", self.root.nsmap)
         if polygon is not None:
diff --git a/app/apps/imports/tests.py b/app/apps/imports/tests.py
index 3abf724a..9a934275 100644
--- a/app/apps/imports/tests.py
+++ b/app/apps/imports/tests.py
@@ -11,6 +11,7 @@ from imports.models import DocumentImport
 from imports.parsers import AltoParser, IIIFManifestParser
 from core.models import Block, Line, Transcription, LineTranscription, BlockType, LineType
 from core.tests.factory import CoreFactoryTestCase
+from reporting.models import TaskReport
 
 
 class XmlImportTestCase(CoreFactoryTestCase):
@@ -36,7 +37,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         filename = 'test_single.alto'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(13):
+            with self.assertNumQueries(20):
                 response = self.client.post(uri, {
                     'upload_file': SimpleUploadedFile(filename, fh.read())
                 })
@@ -51,17 +52,14 @@ class XmlImportTestCase(CoreFactoryTestCase):
 
         # import was created since the form validated
         imp = DocumentImport.objects.first()
-        self.assertTrue(imp.error_message, 'No match found')
-
-        self.part1.original_filename = 'test1.png'
-        self.part1.save()
+        self.assertTrue(imp.report.messages.startswith('No match found for file import_src/test_single_'))
 
     def test_alto_invalid_xml(self):
         uri = reverse('api:document-imports', kwargs={'pk': self.document.pk})
         filename = 'test_invalid_alto.xml'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(6):
+            with self.assertNumQueries(8):
                 response = self.client.post(uri, {
                     'upload_file': SimpleUploadedFile(filename, fh.read())
                 })
@@ -72,7 +70,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         filename = 'test_single.alto'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(47):
+            with self.assertNumQueries(54):
                 response = self.client.post(uri, {
                     'upload_file': SimpleUploadedFile(filename, fh.read())
                 })
@@ -86,7 +84,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         self.assertEqual(self.part1.blocks.count(), 1)
         self.assertEqual(self.part1.blocks.first().box, [[0, 0], [850, 0], [850, 1083], [0, 1083]])
         self.assertEqual(self.part1.lines.count(), 3)
-        self.assertEqual(self.part1.lines.first().box, [160, 771, 220, 799])
+        self.assertEqual(self.part1.lines.first().box, [160, 771, 220, 771])
         self.assertEqual(self.part1.lines.first().transcriptions.first().content, 'This is a test')
         self.assertEqual(self.part2.blocks.count(), 0)
         self.assertEqual(self.part2.lines.count(), 0)
@@ -96,7 +94,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         filename = 'test_single_baselines.alto'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(33):
+            with self.assertNumQueries(40):
                 response = self.client.post(uri, {
                     'upload_file': SimpleUploadedFile(filename, fh.read())
                 })
@@ -120,7 +118,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         filename = 'test.zip'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(65):
+            with self.assertNumQueries(72):
                 response = self.client.post(uri, {
                     'upload_file': SimpleUploadedFile(filename, fh.read())
                 })
@@ -130,6 +128,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         self.assertEqual(DocumentImport.objects.count(), 1)
         self.assertEqual(DocumentImport.objects.first().workflow_state,
                          DocumentImport.WORKFLOW_STATE_DONE)
+
         self.assertEqual(self.part1.blocks.count(), 1)
         self.assertEqual(self.part1.lines.count(), 3)
         self.assertEqual(self.part2.blocks.count(), 1)
@@ -144,7 +143,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         filename = 'test_composedblock.alto'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(59):
+            with self.assertNumQueries(66):
                 response = self.client.post(uri, {
                     'upload_file': SimpleUploadedFile(filename, fh.read())
                 })
@@ -170,6 +169,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         with open(mock_path, 'rb') as fh:
             imp = DocumentImport.objects.create(
                 document=self.document,
+                started_by=self.document.owner,
                 import_file=ContentFile(
                     fh.read(),
                     name=os.path.join(
@@ -189,7 +189,8 @@ class XmlImportTestCase(CoreFactoryTestCase):
         b = Block.objects.create(document_part=self.part1, external_id="textblock_0",
                                  box=[[0, 0], [100, 100]])
         l = Line.objects.create(document_part=self.part1, block=b, external_id="line_0",
-                                box=[10, 10, 50, 20])
+                                baseline=((5, 5), (5, 10)),
+                                mask=((0, 0), (0, 10), (10, 0), (10,10)))
         lt = LineTranscription.objects.create(transcription=trans, line=l)
 
         uri = reverse('api:document-imports', kwargs={'pk': self.document.pk})
@@ -221,12 +222,13 @@ class XmlImportTestCase(CoreFactoryTestCase):
         b = Block.objects.create(document_part=self.part1, external_id="textblock_0",
                                  box=[[0, 0], [100, 100]])
         l = Line.objects.create(document_part=self.part1, block=b, external_id="line_0",
-                                box=[10, 10, 50, 20])
+                                baseline=((5, 5), (5, 10)),
+                                mask=((0, 0), (0, 10), (10, 0), (10,10)))
         lt = LineTranscription.objects.create(transcription=trans, line=l, content="test history")
 
         # historic line without external_id
         b2 = Block.objects.create(document_part=self.part1, box=[[0, 0], [100, 100]])
-        l2 = Line.objects.create(document_part=self.part1, block=b2, box=[10, 10, 50, 20])
+        l2 = Line.objects.create(document_part=self.part1, block=b2, baseline=((10, 10), (50, 20)))
         lt2 = LineTranscription.objects.create(transcription=trans, line=l2, content="test dummy")
 
         uri = reverse('api:document-imports', kwargs={'pk': self.document.pk})
@@ -263,7 +265,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         filename = 'pagexml_test.xml'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(13):
+            with self.assertNumQueries(20):
                 response = self.client.post(uri, {'upload_file': SimpleUploadedFile(filename,
                                                                                     fh.read())})
                 # Note: the ParseError is raised by the processing of the import,
@@ -276,7 +278,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         self.assertEqual(self.part3.lines.count(), 0)
         # import was created since the form validated
         imp = DocumentImport.objects.first()
-        self.assertTrue(imp.error_message, 'No match found')
+        self.assertTrue(imp.report.messages.startswith('No match found'))
         self.part3.original_filename = 'test3.png'
         self.part3.save()
 
@@ -285,7 +287,8 @@ class XmlImportTestCase(CoreFactoryTestCase):
         block = Block.objects.create(document_part=self.part3, external_id="r2",
                                      box=[[0, 0], [100, 100]])
         l = Line.objects.create(document_part=self.part3, block=block, external_id="r2l1",
-                                box=[10, 10, 50, 20])
+                                baseline=((5, 5), (5, 10)),
+                                mask=((0, 0), (0, 10), (10, 0), (10,10)))
         lt = LineTranscription.objects.create(transcription=trans, line=l)
 
         uri = reverse('api:document-imports', kwargs={'pk': self.document.pk})
@@ -307,8 +310,11 @@ class XmlImportTestCase(CoreFactoryTestCase):
         trans = Transcription.objects.create(name="test import", document=self.document)
         block = Block.objects.create(document_part=self.part3, external_id="r2",
                                      box=[[0, 0], [100, 100]])
-        line = Line.objects.create(document_part=self.part3, block=block, external_id="r2l1",
-                                   box=[10, 10, 50, 20])
+        line = Line.objects.create(document_part=self.part3,
+                                   block=block,
+                                   external_id="r2l1",
+                                   baseline=((5, 5), (5, 10)),
+                                   mask=((0, 0), (0, 10), (10, 0), (10,10)))
         lt = LineTranscription.objects.create(transcription=trans, line=line)
 
         uri = reverse('api:document-imports', kwargs={'pk': self.document.pk})
@@ -333,7 +339,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         filename = 'test_pagexml.zip'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(406):  # theres a lot of lines in there
+            with self.assertNumQueries(413):  # there are a lot of lines in there
                 response = self.client.post(uri, {
                     'upload_file': SimpleUploadedFile(filename, fh.read())
                 })
@@ -362,7 +368,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
         filename = 'test_pagexml_types.xml'
         mock_path = os.path.join(os.path.dirname(__file__), 'mocks', filename)
         with open(mock_path, 'rb') as fh:
-            with self.assertNumQueries(67):
+            with self.assertNumQueries(74):
                 response = self.client.post(uri, {
                     'upload_file': SimpleUploadedFile(filename, fh.read())
                 })
@@ -398,7 +404,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
 
             # we don't go through the form but we want to test json validation
             fh.seek(0)
-            IIIFManifestParser(self.document, fh).validate()
+            IIIFManifestParser(self.document, fh, imp.report).validate()
 
             imp.save()
 
@@ -424,6 +430,7 @@ class XmlImportTestCase(CoreFactoryTestCase):
             document=self.document,
             import_file=ContentFile('', name='doesntmatter.xml'),
             workflow_state=DocumentImport.WORKFLOW_STATE_STARTED,
+            started_by=self.document.owner,
             processed=0)
         uri = reverse('api:document-cancel-import', kwargs={'pk': self.document.pk})
         response = self.client.post(uri)
@@ -447,7 +454,8 @@ class DocumentExportTestCase(CoreFactoryTestCase):
             self.parts.append(part)
             for j in range(1, 4):
                 l = Line.objects.create(document_part=part,
-                                        box=(0, 0, 1, 1))
+                                        baseline=((5, 5), (5, 10)),
+                                        mask=((0, 0), (0, 10), (10, 0), (10,10)))
                 LineTranscription.objects.create(
                     line=l,
                     transcription=self.trans,
@@ -455,7 +463,7 @@ class DocumentExportTestCase(CoreFactoryTestCase):
 
     def test_simple(self):
         self.client.force_login(self.user)
-        with self.assertNumQueries(10):
+        with self.assertNumQueries(17):
             response = self.client.post(reverse('api:document-export',
                                                 kwargs={'pk': self.trans.document.pk}),
                                         {'transcription': self.trans.pk,
@@ -468,7 +476,7 @@ class DocumentExportTestCase(CoreFactoryTestCase):
 
     def test_alto(self):
         self.client.force_login(self.user)
-        with self.assertNumQueries(19):
+        with self.assertNumQueries(28):
             response = self.client.post(reverse('api:document-export',
                                                 kwargs={'pk': self.trans.document.pk}),
                                         {'transcription': self.trans.pk,
@@ -485,13 +493,14 @@ class DocumentExportTestCase(CoreFactoryTestCase):
             for j in range(1, 4):
                 l = Line.objects.create(document_part=part,
                                         block=block,
-                                        box=(0, 0, 1, 1))
+                                        baseline=((5, 5), (5, 10)),
+                                        mask=((0, 0), (0, 10), (10, 0), (10,10)))
                 LineTranscription.objects.create(
                     line=l,
                     transcription=self.trans,
                     content='line %d:%d' % (i, j))
         self.client.force_login(self.user)
-        with self.assertNumQueries(10):
+        with self.assertNumQueries(17):
             response = self.client.post(reverse('api:document-export',
                                                 kwargs={'pk': self.trans.document.pk}),
                                         {'transcription': self.trans.pk,
-- 
GitLab