Attention : une mise à jour du service GitLab sera effectuée le mardi 30 novembre entre 17h30 et 18h00. Cette mise à jour entraînera une interruption du service dont nous ne maîtrisons pas complètement la durée, mais qui ne devrait pas excéder quelques minutes. Cette mise à jour intermédiaire vers la version 14.0.12 nous permettra de mettre rapidement à votre disposition une version plus récente.

Commit 3e92df67 authored by Robin Tissot
Browse files

Merge branch 'feature/pdf' into 'develop'

Add PDF Import option.

See merge request !50
parents 0b35f6b1 3cec329b
......@@ -23,7 +23,7 @@ RUN apt-get update
RUN addgroup --system uwsgi
RUN adduser --system --no-create-home --ingroup uwsgi uwsgi
RUN apt-get install netcat-traditional jpegoptim pngcrush
RUN apt-get install netcat-traditional jpegoptim pngcrush libvips
RUN pip install --upgrade pip
......
......@@ -5,8 +5,8 @@ import re
import requests
import time
import uuid
import warnings
import zipfile
import pyvips
from lxml import etree
from django.conf import settings
......@@ -66,6 +66,47 @@ class ParserDocument:
return transcription
class PdfParser(ParserDocument):
    """Import the pages of a PDF file as image parts of a document.

    The PDF is rasterized with pyvips at 300 dpi with ``n=-1`` (all pages).
    ``parse`` treats the resulting image as the pages stacked vertically and
    slices it into one PNG per page.
    """

    def _load_pdf(self):
        """Rewind the uploaded file and load it as a pyvips image.

        Raises:
            pyvips.error.Error: if pyvips cannot load the buffer.
        """
        # Both validate() and parse() consume the whole file; without
        # rewinding, the second read would return an empty buffer.
        seek = getattr(self.file, 'seek', None)
        if seek is not None:
            try:
                seek(0)
            except (OSError, ValueError):
                # Non-seekable stream: read from the current position,
                # exactly as the code did before the rewind was added.
                pass
        return pyvips.Image.new_from_buffer(self.file.read(), "",
                                            dpi=300, n=-1,
                                            access="sequential")

    def validate(self):
        """Check that the file can be loaded as a PDF.

        The loaded image is stored on ``self.doc``.

        Raises:
            ParseError: if pyvips cannot load the file.
        """
        try:
            self.doc = self._load_pdf()
        except pyvips.error.Error as e:
            logger.exception(e)
            raise ParseError(_("Invalid pdf file.")) from e

    @property
    def total(self):
        """Number of pages reported by pyvips, or 0 if it reports none."""
        if 'n-pages' in self.doc.get_fields():
            return self.doc.get('n-pages')
        else:
            return 0

    def parse(self, start_at=0, override=False, user=None):
        """Create, save and yield one ``DocumentPart`` per PDF page.

        Args:
            start_at: zero-based index of the first page to import; earlier
                pages are skipped.
                NOTE(review): assumes callers use this to resume an
                interrupted import - confirm against the other parsers.
            override: not used by this parser.
            user: not used by this parser.

        Yields:
            The saved ``DocumentPart`` of each imported page, in page order.

        Raises:
            pyvips.error.Error: if the file cannot be loaded at all. Errors
                raised while slicing pages are logged and appended to
                ``self.report`` instead of being propagated.
        """
        self.doc = self._load_pdf()
        try:
            # pyvips operations return a new image, so the flattened result
            # must be kept; calling flatten() for its side effect does nothing.
            doc = self.doc.flatten(background=255)
            n_pages = doc.get('n-pages')
            page_width = doc.width
            # Integer division: crop() takes integer pixel coordinates.
            page_height = doc.height // n_pages
            for i in range(max(start_at, 0), n_pages):
                page = doc.crop(0, i * page_height, page_width, page_height)
                part = DocumentPart(document=self.document)
                part.image.save('%s_page_%d.png' % (self.file.name, i+1),
                                ContentFile(page.write_to_buffer('.png')))
                part.save()
                yield part
        except pyvips.error.Error as e:
            msg = _("Parse error in {filename}: {error}, skipping it.").format(
                filename=self.file.name, error=e.args[0]
            )
            logger.warning(msg)
            if self.report:
                self.report.append(msg)
class ZipParser(ParserDocument):
"""
For now only deals with a flat list of Alto files
......@@ -684,6 +725,8 @@ def make_parser(document, file_handler, name=None, report=None):
return IIIFManifestParser(document, file_handler, report)
elif ext == "zip":
return ZipParser(document, file_handler, report, transcription_name=name)
elif ext == "pdf":
return PdfParser(document, file_handler, report)
else:
raise ValueError(
"Invalid extension for the file to be parsed %s." % file_handler.name
......
......@@ -30,7 +30,8 @@
<i class="fas fa-file-import mr-1"></i>{% trans "Import" %}
</button>
<div class="dropdown-menu">
<a data-proc="import-iiif" class="js-proc-selected dropdown-item" href="#">Images (IIIF)</a>
<a data-proc="import-iiif" class="js-proc-selected dropdown-item" href="#">Images (IIIF)</a>
<a data-proc="import-pdf" class="js-proc-selected dropdown-item" href="#">Images (PDF)</a>
<a data-proc="import-xml" class="js-proc-selected dropdown-item" href="#">Transcriptions (XML)</a>
</div>
</div>
......@@ -103,6 +104,7 @@
{# Process wizards #}
{% include 'core/wizards/import_iiif.html' with proc='import-iiif' %}
{% include 'core/wizards/import_file.html' with proc='import-xml' %}
{% include 'core/wizards/import_pdf.html' with proc='import-pdf' %}
{% include 'core/wizards/export.html' with proc='export' %}
{% include 'core/wizards/binarize.html' with proc='binarize' %}
{% include 'core/wizards/segment.html' with proc='segment' %}
......
{% extends 'core/wizards/import.html' %}
{% load i18n bootstrap %}
{# Import wizard variant for uploading a PDF document; it only overrides blocks of the extended import wizard template. #}
{# Override the selected_images block with empty content. #}
{% block selected_images %}{% endblock %}
{# The form action is the 'api:document-imports' URL for the current object. #}
{% block wizard_action %}action="{% url 'api:document-imports' pk=object.pk %}"{% endblock %}
{# Keep the parent's fields, then render the upload field (with accept=".pdf") only while no import is ongoing. #}
{% block wizard_fields %}
{{ block.super }}
{% if not import_form.current_import.ongoing %}
<h5>{% trans "Import images from a PDF document." %}</h5>
<div class="form-group">
{% render_field import_form.upload_file class="js-proc-settings" accept=".pdf" %}
</div>
{% endif %}
{% endblock %}
{# Label of the submit button. #}
{% block wizard_submit %}{% trans "Start importing" %}{% endblock %}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment