#!/usr/bin/python
import ijson
from six import string_types
from enum import Enum

class MatchingEvent(Enum):
    """Pair each closing parser event with the opening event it terminates.

    Member *names* are closing event names and member *values* are the
    corresponding opening event names, so ``MatchingEvent[closer].value``
    yields the opener that a given closer matches.
    """
    # A map is closed by 'end_map' and opened by 'start_map'.
    end_map = "start_map"
    # An array is closed by 'end_array' and opened by 'start_array'.
    end_array = "start_array"

class VidjilWriter(object):
    """Turn a stream of (prefix, event, value) parser events back into JSON text.

    Output is returned from ``write`` and, when the writer was entered as a
    context manager with a ``filepath``, also written to that file.  While
    buffering is active the text is accumulated in ``buffer`` instead, and is
    either flushed or discarded by ``endBuffering`` depending on
    ``conserveBuffer``.
    """

    # Raw character -> JSON escape sequence.  The backslash comes first so that
    # the backslashes introduced by the later replacements are not escaped a
    # second time.
    _ESCAPES = (
        ('\\', '\\\\'),
        ('"', '\\"'),
        ('\n', '\\n'),
        ('\r', '\\r'),
        ('\t', '\\t'),
    )

    # Structural events and the literal token each of them emits.
    _STRUCTURAL = {
        'start_map': '{',
        'end_map': '}',
        'start_array': '[',
        'end_array': ']',
    }

    def __init__(self, filepath=None, pretty=False):
        """Create a writer.

        filepath -- optional path opened for writing by ``__enter__``.
        pretty   -- when true, emit newlines and tab indentation.
        """
        self._filepath = filepath
        self.pretty = pretty
        self.buffer = ""
        self.buffering = False
        self.conserveBuffer = False
        self.file = None

    def __enter__(self):
        """Open the output file (if a path was given) and return the writer."""
        if self._filepath:
            self.file = open(self._filepath, 'w')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output file, if one is open; exceptions are not suppressed."""
        if self.file:
            self.file.close()
            # Drop the closed handle so a later write() does not try to use it.
            self.file = None

    def write(self, prefix, event, value, previous):
        """Serialise one event.

        While buffering, the text is appended to ``buffer`` and an empty
        string is returned.  Otherwise the text is written to the open file
        (if any) and returned.
        """
        res = self._write(prefix, event, value, previous)
        if self.buffering:
            self.buffer += res
            return ""
        elif self.file:
            self.file.write(res)
        return res

    def _escape(self, value):
        """Return ``value`` with JSON escapes applied when it is a string.

        Non-string values are returned unchanged.
        """
        if isinstance(value, string_types):
            for raw, escaped in self._ESCAPES:
                value = value.replace(raw, escaped)
        return value

    def _write(self, prefix, event, value, previous):
        """Return the JSON text for a single event.

        prefix   -- dotted path of the event; its depth drives indentation
                    in pretty mode.
        event    -- event name ('start_map', 'map_key', 'string', ...).
        value    -- value carried by the event (ignored for structural events).
        previous -- name of the previously written event ('' at the start);
                    used to decide whether a separating comma is needed.
        """
        end = '\n' if self.pretty else ''

        if event in self._STRUCTURAL:
            token = self._STRUCTURAL[event]
        elif event == 'map_key':
            token = '"{}":'.format(self._escape(value))
            # The value follows its key on the same line.
            end = ''
        elif event == 'string':
            token = '"{}"'.format(self._escape(value))
        elif event == 'null':
            # Formatting None directly would emit Python's 'None'.
            token = 'null'
        elif event == 'boolean':
            token = str(value).lower()
        else:
            # Numbers and any other scalar are emitted through str().
            token = '{}'.format(self._escape(value))

        # A comma is needed unless this is the first element of a container,
        # the value of a key, the very first event, or a closing bracket.
        if previous not in ('', 'map_key', 'start_map', 'start_array') \
                and event not in ('end_map', 'end_array'):
            token = ',' + token

        padding = ''
        if self.pretty and previous != 'map_key' and len(prefix) > 0:
            # One tab per component of the dotted prefix.
            padding = '\t' * len(prefix.split('.'))

        return padding + token + end

    def purgeBuffer(self):
        """Discard the buffered text and clear the conserve flag."""
        self.buffer = ""
        self.conserveBuffer = False

    def writeBuffer(self):
        """Flush the buffer to the open file (if any) and return its content.

        The buffer is purged afterwards, even if the file write fails.
        """
        try:
            if self.file:
                self.file.write(self.buffer)
            return self.buffer
        finally:
            self.purgeBuffer()

    def startBuffering(self):
        """Start accumulating output in the buffer instead of emitting it."""
        self.conserveBuffer = False
        self.buffering = True

    def endBuffering(self):
        """Stop buffering.

        Returns the buffered text (after flushing it) when ``conserveBuffer``
        was set, otherwise discards the buffer and returns an empty string.
        """
        self.buffering = False
        if self.conserveBuffer:
            self.conserveBuffer = False
            return self.writeBuffer()
        self.purgeBuffer()
        return ""

class Predicate(object):
    """A condition on one field: a field name plus an optional comparison."""

    def __init__(self, field, comparator, value):
        """Store the condition.

        field      -- name (prefix) the predicate applies to.
        comparator -- callable ``comparator(other, value)`` or None; with
                      None the predicate only checks the field name.
        value      -- reference value handed to the comparator.
        """
        self.comp = comparator
        self.field = field
        self.value = value

    def compare(self, field, other):
        """Tell whether ``field``/``other`` satisfies the predicate.

        Returns False when ``field`` is not the predicate's field.  Without a
        comparator a matching field is enough.  Otherwise the comparator's
        result for ``(other, self.value)`` is returned; a comparator that
        raises (e.g. on incomparable types) counts as no match.
        """
        if not (field == self.field):
            return False
        if self.comp is None:
            return True
        try:
            return self.comp(other, self.value)
        except Exception:
            # Only ordinary errors mean "no match"; KeyboardInterrupt and
            # SystemExit must keep propagating.
            return False


class VidjilParser(object):
    """Stream a JSON file through ijson and re-emit only the selected parts.

    Parts are selected with ``addPrefix``; the selected events are serialised
    by the writer given at construction time (a ``VidjilWriter`` by default).
    """

    def __init__(self, writer=None):
        """Create a parser using ``writer``, or a default VidjilWriter."""
        if writer is not None:
            self._writer = writer
        else:
            self._writer = VidjilWriter()
        self._model_prefixes = []
        self.prefixes = []

    def initModel(self, model_path):
        """Record every distinct (prefix, event) pair found in the model file."""
        with open(model_path, 'r') as model:
            for prefix, event, _value in ijson.parse(model):
                pair = (prefix, event)
                if pair not in self._model_prefixes:
                    self._model_prefixes.append(pair)

    def validate(self, filepath):
        """Return True when the file contains every (prefix, event) pair of the model."""
        with open(filepath, 'r') as vfile:
            # Work on a copy so the model can be reused for other files.
            missing = list(self._model_prefixes)
            for prefix, event, _value in ijson.parse(vfile):
                pair = (prefix, event)
                if pair in missing:
                    missing.remove(pair)
            return len(missing) == 0

    def writer(self):
        """Return the writer used by this parser."""
        return self._writer

    def addPrefix(self, prefix, conditional=None, comp=None, value=None):
        """Select ``prefix`` for extraction.

        prefix      -- dotted prefix of the part to extract.
        conditional -- field the condition applies to (defaults to ``prefix``).
        comp        -- comparator ``comp(found, value)``; None means the
                       presence of the field is enough.
        value       -- reference value passed to ``comp``.
        """
        if conditional is None:
            conditional = prefix
        self.prefixes.append((prefix, Predicate(conditional, comp, value)))

    def reset(self):
        """Forget all selected prefixes and drop any buffered output."""
        self.prefixes = []
        self._writer.purgeBuffer()

    def extract(self, filepath):
        """Extract the selected parts of ``filepath`` and return them as text.

        The input file is closed before returning, including on error.
        """
        with open(filepath, 'r') as vidjilfile:
            parser = ijson.parse(vidjilfile)
            with self.writer() as writer:
                return self._extract(parser, writer)

    def isStartEvent(self, event):
        """Tell whether ``event`` opens a map or an array."""
        return event in ['start_map', 'start_array']

    def isEndEvent(self, event):
        """Tell whether ``event`` closes a container or is a number, string or boolean."""
        return event in ['end_map', 'end_array', 'number', 'string', 'boolean']

    def isMatching(self, mbuffer, other):
        """Tell whether ``other`` closes the container opened by ``mbuffer``.

        Both arguments are (prefix, event) pairs; they match when the prefixes
        are equal and ``other``'s event is the closer of ``mbuffer``'s event.
        """
        if other[1] not in MatchingEvent.__members__:
            return False
        return (mbuffer[0] == other[0]) and (mbuffer[1] == MatchingEvent[other[1]].value)

    def _isSelected(self, prefix, value):
        """Tell whether an event at ``prefix`` carrying ``value`` must be written.

        An event is kept when its prefix starts with a selected prefix, or
        when a selected prefix starts with the event's prefix and the event
        either carries no value or carries a value that the selected prefix
        starts with.
        NOTE(review): the value is compared against the start of the whole
        selected prefix, not against its next path component -- confirm this
        is intended for nested prefixes.
        """
        for item in self.prefixes:
            wanted = item[0]
            if prefix.startswith(wanted):
                return True
            if wanted.startswith(prefix) \
                    and (value is None or wanted.startswith(str(value))):
                return True
        return False

    def _extract(self, parser, writer):
        """Write the selected events of ``parser`` and return the emitted text.

        parser -- iterable of (prefix, event, value) triples.
        writer -- object whose ``write`` serialises each kept event.

        A container whose prefix is exactly a selected prefix is buffered; the
        buffer is flushed as soon as one of the predicates matches and
        discarded if the container closes without a match.
        """
        previous = ''
        # Fallback for a writer that is already buffering when the loop starts.
        saved_previous = ''
        chunks = []
        bufferStart = (None, None)
        for prefix, event, value in parser:
            if not self._isSelected(prefix, value):
                continue

            if self.isStartEvent(event) \
                    and any(prefix == item[0] for item in self.prefixes):
                bufferStart = (prefix, event)
                # Remember the state before the container, to restore it if
                # the container ends up being discarded.
                saved_previous = previous
                self._writer.startBuffering()

            if not self._writer.conserveBuffer \
                    and any(item[1].compare(prefix, value) for item in self.prefixes):
                self._writer.conserveBuffer = True

            chunks.append(writer.write(prefix, event, value, previous))
            previous = event

            if self.writer().buffering and (
                    (self.isEndEvent(event)
                     and self.isMatching(bufferStart, (prefix, event)))
                    or self._writer.conserveBuffer):
                if not self._writer.conserveBuffer:
                    # Nothing was emitted for this container: pretend it was
                    # never seen so the next comma decision stays correct.
                    previous = saved_previous
                chunks.append(self._writer.endBuffering())
        return ''.join(chunks)