#!/usr/bin/python
# vidjilparser.py
import json
from enum import Enum

import ijson.backends.yajl2_cffi as ijson
from six import string_types

# Each member is named after a closing parse event and its value is the name
# of the opening event that it terminates.
MatchingEvent = Enum(
    'MatchingEvent',
    [
        ('end_map', 'start_map'),
        ('end_array', 'start_array'),
    ],
)

class VidjilWriter(object):
    """Turn a stream of (prefix, event, value) parse events back into JSON text.

    Each call to ``write`` renders one event.  While buffering is enabled the
    rendered fragments are collected in ``self.buffer`` instead of being
    returned; ``endBuffering`` then either returns the collected text (when
    ``conserveBuffer`` has been set) or throws it away.
    """

    # Format templates for the events that do not simply print their value.
    # '{{' / '}}' are escaped braces: every template goes through str.format.
    _TEMPLATES = {
        'start_map': '{{',
        'end_map': '}}',
        'start_array': '[',
        'end_array': ']',
        'map_key': '"{}":',
        'string': '"{}"',
    }

    # Events after which the next fragment must not be preceded by a comma.
    _NO_COMMA_AFTER = ('', 'map_key', 'start_map', 'start_array')

    # Events that close a container and therefore never take a leading comma.
    _CLOSING_EVENTS = ('end_map', 'end_array')

    # Shared encoder used to escape string content.  ensure_ascii=False keeps
    # non-ASCII characters verbatim instead of turning them into \uXXXX.
    _string_encoder = json.JSONEncoder(ensure_ascii=False)

    def __init__(self, pretty=False):
        """Create a writer.

        :param pretty: when true, fragments are tab-indented according to the
            depth of their prefix and terminated with a newline.
        """
        self.pretty = pretty
        self.buffer = []
        self.buffering = False
        self.conserveBuffer = False

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Nothing to release for the in-memory writer.
        pass

    def write(self, prefix, event, value, previous):
        """Render one event and return the text to emit right now.

        :param prefix: dotted path of the element the event belongs to.
        :param event: event name (``start_map``, ``map_key``, ``string``, ...).
        :param value: value carried by the event, ``None`` when there is none.
        :param previous: name of the previously written event, ``''`` if none.
        :return: the rendered fragment, or ``""`` when the fragment was stored
            in the buffer because buffering is active.
        """
        res = self._write(prefix, event, value, previous)
        if self.buffering:
            self.buffer.append(res)
            return ""
        return res

    def _write(self, prefix, event, value, previous):
        """Build the JSON fragment for a single event (no buffering logic)."""
        end = '\n' if self.pretty else ''

        if event in self._TEMPLATES:
            mstr = self._TEMPLATES[event]
            if event == 'map_key':
                # The value follows its key on the same line.
                end = ''
        else:
            if event == 'boolean':
                value = str(value).lower()
            mstr = 'null' if value is None else '{}'

        if isinstance(value, string_types):
            # Escape quotes, backslashes and control characters so that the
            # output stays valid JSON.  encode() returns a quoted JSON string;
            # the surrounding quotes are dropped because the templates above
            # already provide them where needed.
            value = self._string_encoder.encode(value)[1:-1]

        if previous not in self._NO_COMMA_AFTER \
                and event not in self._CLOSING_EVENTS:
            mstr = "," + mstr

        padding = ''
        if self.pretty and previous != 'map_key' and len(prefix) > 0:
            # One tab per component of the dotted prefix.
            padding = '\t' * len(prefix.split('.'))

        return ('{}' + mstr + end).format(padding, value)

    def purgeBuffer(self):
        """Drop the buffered fragments and clear the conserve flag."""
        self.buffer = []
        self.conserveBuffer = False

    def writeBuffer(self):
        """Return the buffered fragments joined together, then purge them."""
        try:
            return ''.join(self.buffer)
        finally:
            self.purgeBuffer()

    def startBuffering(self):
        """Start collecting fragments instead of returning them from write."""
        self.conserveBuffer = False
        self.buffering = True

    def endBuffering(self):
        """Stop buffering.

        :return: the buffered text when ``conserveBuffer`` was set, otherwise
            ``""`` (the buffer content is discarded).
        """
        self.buffering = False
        if self.conserveBuffer:
            self.conserveBuffer = False
            return self.writeBuffer()
        self.purgeBuffer()
        return ""

class VidjilFileWriter(VidjilWriter):
    """VidjilWriter that also writes every emitted fragment to a file.

    The file is opened when the writer is entered as a context manager and
    closed when the context exits.
    """

    def __init__(self, filepath=None, pretty=False):
        """Create a file-backed writer.

        :param filepath: path of the output file, opened by ``__enter__``.
        :param pretty: forwarded to ``VidjilWriter``.
        """
        # super() must be given *this* class: super(VidjilWriter, self) skips
        # VidjilWriter itself and resolves to object, leaving the writer state
        # uninitialised.
        super(VidjilFileWriter, self).__init__(pretty)
        self._filepath = filepath
        self.file = None

    def __enter__(self):
        self.file = open(self._filepath, 'wb')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.file.close()

    def _dump(self, text):
        """Write *text* to the output file, encoding it when it is not bytes."""
        if not text:
            return
        if not isinstance(text, bytes):
            # The file is opened in binary mode, so text must be encoded.
            text = text.encode('utf-8')
        self.file.write(text)

    def write(self, prefix, event, value, previous):
        """Render one event, write the resulting text to the file, return it."""
        res = super(VidjilFileWriter, self).write(prefix, event, value, previous)
        self._dump(res)
        return res

    def writeBuffer(self):
        """Flush the buffered text to the file and return it."""
        res = super(VidjilFileWriter, self).writeBuffer()
        self._dump(res)
        return res

class Predicate(object):
    """Condition on a named field, optionally comparing its value.

    ``comparator`` is a two-argument callable invoked as
    ``comparator(candidate, value)``; when it is ``None`` only the field name
    is checked.
    """

    def __init__(self, field, comparator, value):
        self.comp = comparator
        self.field = field
        self.value = value

    def compare(self, field, other):
        """Tell whether ``field``/``other`` satisfies this predicate.

        :param field: name of the field being examined.
        :param other: value found for that field.
        :return: ``field == self.field`` when there is no comparator.
            Otherwise ``False`` if the field differs, else whatever the
            comparator returns.  A comparison that raises counts as ``False``.
        """
        if self.comp is None:
            return field == self.field
        try:
            return field == self.field and self.comp(other, self.value)
        except Exception:
            # Incomparable values simply do not match.  Only Exception is
            # caught so KeyboardInterrupt / SystemExit still propagate.
            return False


class VidjilParser(object):
    """Extract selected parts of a JSON file through a streaming parse.

    Prefixes registered with ``addPrefix`` select which elements are passed
    to the writer; the predicate attached to each prefix decides whether a
    buffered element is kept or discarded.
    """

    def __init__(self, writer=None):
        """Create a parser.

        :param writer: object that renders the selected events; a plain
            ``VidjilWriter`` is created when omitted.
        """
        if writer is not None:
            self._writer = writer
        else:
            self._writer = VidjilWriter()
        self._model_prefixes = []
        self.prefixes = []

    def initModel(self, model_path):
        """Record every distinct (prefix, event) pair of the file at *model_path*."""
        with open(model_path, 'rb') as model:
            parser = ijson.parse(model)
            for prefix, event, value in parser:
                if (prefix, event) not in self._model_prefixes:
                    self._model_prefixes.append((prefix, event))

    def validate(self, filepath):
        """Check a file against the pairs recorded by ``initModel``.

        :return: ``True`` when every recorded (prefix, event) pair occurs in
            the file at *filepath*.
        """
        with open(filepath, 'rb') as vfile:
            parser = ijson.parse(vfile)
            model = list(self._model_prefixes)
            for prefix, event, value in parser:
                pair = (prefix, event)
                if pair in model:
                    model.remove(pair)
            return len(model) == 0

    def writer(self):
        """Return the writer used by this parser."""
        return self._writer

    def addPrefix(self, prefix, conditional = None, comp = None, value = None):
        """Register a prefix to extract.

        :param prefix: dotted prefix of the elements to extract.
        :param conditional: field the predicate applies to; defaults to
            *prefix*.
        :param comp: two-argument comparator handed to ``Predicate``.
        :param value: reference value handed to ``Predicate``.
        """
        if conditional is None:
            conditional = prefix
        self.prefixes.append((prefix, Predicate(conditional, comp, value)))

    def reset(self):
        """Forget the registered prefixes and empty the writer's buffer."""
        self.prefixes = []
        self._writer.purgeBuffer()

    def extract(self, filepath):
        """Parse the file at *filepath* and return the extracted text."""
        # The with-block closes the input file once the lazy parser has been
        # fully consumed by _extract.
        with open(filepath, 'rb') as vidjilfile:
            parser = ijson.parse(vidjilfile)
            with self.writer() as writer:
                return self._extract(parser, writer)

    def isStartEvent(self, event):
        """Tell whether *event* opens a map or an array."""
        return event in ['start_map', 'start_array']

    def isEndEvent(self, event):
        """Tell whether *event* closes a container or carries a scalar value."""
        return event in ['end_map', 'end_array', 'number', 'string', 'boolean']

    def isMatching(self, mbuffer, other):
        """Tell whether *other* closes what *mbuffer* opened.

        Both arguments are (prefix, event) pairs.  The result is true when the
        prefixes are equal and ``other``'s event is the closing counterpart
        of ``mbuffer``'s event according to ``MatchingEvent``.
        """
        if other[1] not in MatchingEvent.__members__:
            return False
        return (mbuffer[0] == other[0]) and (mbuffer[1] == MatchingEvent[other[1]].value)

    def _isSelected(self, prefix, value):
        """Tell whether an event at *prefix* concerns a registered prefix.

        The event is selected when it lies under a registered prefix, or when
        it is an ancestor of one and either carries no value or carries a
        value the registered prefix starts with.
        """
        for item in self.prefixes:
            wanted = item[0]
            if prefix.startswith(wanted):
                return True
            if wanted.startswith(prefix) \
                    and (value is None or wanted.startswith(str(value))):
                return True
        return False

    def _extract(self, parser, writer):
        """Feed the selected events of *parser* to the writer.

        :param parser: iterable of (prefix, event, value) triples.
        :param writer: object whose ``write`` renders each selected event.
        :return: the concatenation of everything the writer returned.
        """
        previous = ''
        # Defined up front so that a writer which is already buffering when
        # the loop starts cannot trigger a NameError below.
        saved_previous = ''
        bufferStart = (None, None)
        pieces = []
        for prefix, event, value in parser:
            if not self._isSelected(prefix, value):
                continue

            # Opening a registered element: start buffering it and remember
            # what preceded it, in case the element is dropped afterwards.
            if self.isStartEvent(event) \
                    and any(prefix == item[0] for item in self.prefixes):
                bufferStart = (prefix, event)
                saved_previous = previous
                self._writer.startBuffering()

            if not self._writer.conserveBuffer \
                    and any(item[1].compare(prefix, value) for item in self.prefixes):
                self._writer.conserveBuffer = True

            pieces.append(writer.write(prefix, event, value, previous))

            previous = event
            # Stop buffering as soon as the element is known to be kept, or
            # when the event closing the buffered element has been reached.
            if self._writer.buffering and (
                    self._writer.conserveBuffer
                    or (self.isEndEvent(event)
                        and self.isMatching(bufferStart, (prefix, event)))):
                if not self._writer.conserveBuffer:
                    # The element is discarded: act as if it was never seen.
                    previous = saved_previous
                pieces.append(self._writer.endBuffering())
        return ''.join(pieces)