vidjilparser.py 5.47 KB
Newer Older
1 2
#!/usr/bin/python
import ijson
3
from six import string_types
4 5 6

class VidjilWriter(object):
    """Render a stream of JSON parse events back into JSON text.

    Each call to ``write`` converts one ``(prefix, event, value)`` event into
    its textual form.  The text is returned to the caller and, when the writer
    has been entered as a context manager with a ``filepath``, also written to
    that file.  While buffering is active the text is accumulated in
    ``self.buffer`` instead; ``endBuffering`` then either emits or discards
    it depending on ``self.conserveBuffer``.
    """

    # Tokens for events that do not depend on the event value.
    _STRUCTURAL = {
        'start_map': '{',
        'end_map': '}',
        'start_array': '[',
        'end_array': ']',
    }

    # Replacements applied to keys and string values.  The backslash rule
    # must stay first, otherwise the backslashes introduced by the other
    # rules would be escaped a second time.
    _ESCAPES = (
        ('\\', '\\\\'),
        ('"', '\\"'),
        ('\n', '\\n'),
        ('\r', '\\r'),
        ('\t', '\\t'),
        ('\b', '\\b'),
        ('\f', '\\f'),
    )

    def __init__(self, filepath=None, pretty=False):
        """Create a writer.

        filepath -- optional path opened for writing by ``__enter__``
        pretty   -- when true, add newlines and tab indentation to the output
        """
        self._filepath = filepath
        self.pretty = pretty
        self.buffer = ""              # text accumulated while buffering
        self.buffering = False        # True between start/endBuffering
        self.conserveBuffer = False   # whether endBuffering emits the buffer
        self.file = None              # output file, opened by __enter__

    def __enter__(self):
        """Open the output file (if a path was given) and return the writer."""
        if self._filepath:
            self.file = open(self._filepath, 'w')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output file if one was opened; exceptions propagate."""
        if self.file:
            self.file.close()

    def write(self, prefix, event, value, previous):
        """Render one event and route the text.

        Returns "" while buffering (the text is appended to ``self.buffer``),
        otherwise returns the rendered text after also writing it to the
        output file when one is open.
        """
        res = self._write(prefix, event, value, previous)
        if self.buffering:
            self.buffer += res
            return ""
        if self.file:
            self.file.write(res)
        return res

    @classmethod
    def _escape(cls, value):
        """Return ``value`` with the characters listed in _ESCAPES escaped.

        Values without a ``replace`` method are returned unchanged so that a
        caller passing a non-text value still gets it formatted as before.
        """
        if not hasattr(value, 'replace'):
            return value
        for raw, escaped in cls._ESCAPES:
            value = value.replace(raw, escaped)
        return value

    def _write(self, prefix, event, value, previous):
        """Return the JSON text for a single event.

        prefix   -- dotted path of the event; its depth drives the indentation
        event    -- event name ('start_map', 'map_key', 'string', ...)
        value    -- event payload, ignored for structural events
        previous -- name of the event rendered just before this one ('' for
                    the first); used to decide on separators and indentation
        """
        end = '\n' if self.pretty else ''

        if event in self._STRUCTURAL:
            token = self._STRUCTURAL[event]
        elif event == 'map_key':
            token = '"{}":'.format(self._escape(value))
            end = ''  # the value follows its key on the same line
        elif event == 'string':
            token = '"{}"'.format(self._escape(value))
        elif event == 'null':
            # Formatting None directly would emit Python's 'None'.
            token = 'null'
        elif event == 'boolean':
            token = str(value).lower()
        else:
            token = '{}'.format(value)

        # A comma is needed unless this is the first element of a container,
        # the value of a key, or a closing bracket.
        separator = ''
        if previous not in ('', 'map_key', 'start_map', 'start_array') \
                and event not in ('end_map', 'end_array'):
            separator = ','

        # Indent by one tab per path component, except for a value that
        # continues the line started by its key.
        padding = ''
        if self.pretty and previous != 'map_key' and prefix:
            padding = '\t' * len(prefix.split('.'))

        return padding + separator + token + end

    def purgeBuffer(self):
        """Drop the buffered text and clear the conserve flag."""
        self.buffer = ""
        self.conserveBuffer = False

    def writeBuffer(self):
        """Write the buffer to the output file (if open) and return it.

        The buffer is purged afterwards, even if the write fails.
        """
        try:
            if self.file:
                self.file.write(self.buffer)
            return self.buffer
        finally:
            self.purgeBuffer()

    def startBuffering(self):
        """Make subsequent ``write`` calls accumulate into the buffer."""
        self.buffering = True

    def endBuffering(self):
        """Stop buffering and settle the buffered text.

        Returns the buffered text (after writing it out) when
        ``conserveBuffer`` was set, otherwise discards it and returns "".
        """
        self.buffering = False
        if self.conserveBuffer:
            self.conserveBuffer = False
            return self.writeBuffer()
        self.purgeBuffer()
        return ""

91 92 93 94 95 96 97 98 99 100 101 102 103 104
class Predicate(object):
    """A condition of the form ``comparator(candidate, value)`` on one field.

    A predicate built with ``comparator=None`` matches everything.
    """

    def __init__(self, field, comparator, value):
        """Store the field name, the two-argument comparator and the value
        the comparator receives as its second argument."""
        self.comp = comparator
        self.field = field
        self.value = value

    def compare(self, field, other):
        """Tell whether ``other``, found under ``field``, satisfies the predicate.

        Returns True when no comparator is configured.  Otherwise returns
        False when ``field`` differs from the predicate's field, and the
        comparator's result when it matches.  A comparator that raises an
        ordinary exception (for instance on incomparable types) counts as
        "no match".
        """
        if self.comp is None:
            return True
        try:
            return field == self.field and self.comp(other, self.value)
        except Exception:
            # Deliberately best-effort, but narrower than a bare ``except``
            # so that KeyboardInterrupt and SystemExit still propagate.
            return False
105 106 107 108 109 110 111 112


class VidjilParser(object):
    """Extract selected parts of a JSON file using streamed ijson events.

    Prefixes registered with ``addPrefix`` select which parts of the document
    are passed to the writer; ``initModel`` / ``validate`` check that a file
    contains every ``(prefix, event)`` pair seen in a model file.
    """

    def __init__(self, writer=None):
        """Create a parser that renders through ``writer``.

        A default ``VidjilWriter`` is created when no writer is given.
        """
        if writer is not None:
            self._writer = writer
        else:
            self._writer = VidjilWriter()
        self._model_prefixes = []   # distinct (prefix, event) pairs of the model
        self.prefixes = []          # (prefix, Predicate) pairs to extract

    def initModel(self, model_path):
        """Record every distinct (prefix, event) pair of the file at
        ``model_path``, keeping first-seen order."""
        # The set only speeds up the membership test; the list keeps order.
        known = set(self._model_prefixes)
        with open(model_path, 'r') as model:
            for prefix, event, _ in ijson.parse(model):
                pair = (prefix, event)
                if pair not in known:
                    known.add(pair)
                    self._model_prefixes.append(pair)

    def validate(self, filepath):
        """Return True when the file at ``filepath`` contains every
        (prefix, event) pair recorded by ``initModel``."""
        missing = set(self._model_prefixes)
        with open(filepath, 'r') as vfile:
            # The whole file is read even once nothing is missing, so that a
            # parse error later in the file is still reported.
            for prefix, event, _ in ijson.parse(vfile):
                missing.discard((prefix, event))
        return not missing

    def writer(self):
        """Return the writer used by this parser."""
        return self._writer

    def addPrefix(self, prefix, conditional=None, comp=None, value=None):
        """Register ``prefix`` for extraction.

        ``conditional``, ``comp`` and ``value`` are forwarded to ``Predicate``
        as its field, comparator and value.
        """
        self.prefixes.append((prefix, Predicate(conditional, comp, value)))

    def reset(self):
        """Forget all registered prefixes and purge the writer's buffer."""
        self.prefixes = []
        self._writer.purgeBuffer()

    def extract(self, filepath):
        """Run the extraction on the file at ``filepath``.

        Returns the text produced by ``_extract``.  The input file is closed
        when the extraction ends, whether it succeeds or raises.
        """
        with open(filepath, 'r') as vidjilfile:
            parser = ijson.parse(vidjilfile)
            with self.writer() as writer:
                return self._extract(parser, writer)

    @staticmethod
    def _is_selected(prefix, value, paths):
        """Tell whether an event at ``prefix`` carrying ``value`` is kept.

        An event is kept when it lies at or below a registered path, or when
        it lies above one and is either value-less or carries a value that
        leads towards (or itself starts) a registered path.
        """
        if any(prefix.startswith(path) for path in paths):
            return True
        if not any(path.startswith(prefix) for path in paths):
            return False
        if value is None:
            return True
        text = str(value)
        return any(path.startswith((prefix + '.' + text, text)) for path in paths)

    def _extract(self, parser, writer):
        """Feed the selected events of ``parser`` to ``writer``.

        parser -- iterable of (prefix, event, value) tuples
        writer -- object whose ``write`` renders the selected events

        Returns the concatenation of everything the writer returned.
        """
        # NOTE(review): buffering state is driven through self._writer while
        # events are rendered through ``writer``; ``extract`` obtains
        # ``writer`` from self._writer's context manager, so presumably both
        # are the same object -- confirm before passing a different writer.
        previous = ''
        chunks = []
        paths = [path for path, _ in self.prefixes]
        for prefix, event, value in parser:
            if not self._is_selected(prefix, value, paths):
                continue

            # Once any predicate matched, the current buffer must be kept.
            if not self._writer.conserveBuffer \
                    and any(predicate.compare(prefix, value)
                            for _, predicate in self.prefixes):
                self._writer.conserveBuffer = True

            # Maps located exactly at a registered path (or at one of its
            # array items) are buffered until their end is reached.
            buffer_on = any(prefix == path or prefix == path + '.item'
                            for path in paths)
            if buffer_on and event == "start_map":
                self._writer.startBuffering()

            chunks.append(writer.write(prefix, event, value, previous))

            if buffer_on and event == "end_map":
                chunks.append(self._writer.endBuffering())
            previous = event
        return "".join(chunks)