vidjilparser.py 5.36 KB
Newer Older
1 2
#!/usr/bin/python
import json

import ijson
from six import string_types
4 5 6 7 8 9

class VidjilWriter(object):
    """Turn a stream of ijson-style events back into JSON text.

    Each call to :meth:`write` renders one ``(prefix, event, value)`` triple
    as a JSON fragment.  Fragments go straight to the output file, or are
    accumulated in ``self.buffer`` while buffering is switched on; the buffer
    is later flushed or discarded depending on ``self.conserveBuffer``.

    The writer is a context manager: the output file is opened on
    ``__enter__`` and closed on ``__exit__``.
    """

    # When the previous event is one of these, the next token needs no comma.
    _NO_COMMA_AFTER = ('', 'map_key', 'start_map', 'start_array')
    # Closing tokens never take a leading comma.
    _CLOSING_EVENTS = ('end_map', 'end_array')
    # Structural events and the literal text they produce.
    _STRUCTURAL_TOKENS = {
        'start_map': '{',
        'end_map': '}',
        'start_array': '[',
        'end_array': ']',
    }

    def __init__(self, filepath, pretty=False):
        """Remember the output path; the file itself is opened on ``__enter__``.

        :param filepath: path of the file to write.
        :param pretty: when true, emit newlines and tab indentation.
        """
        self._filepath = filepath
        self.pretty = pretty
        self.file = None
        self.buffer = ""
        self.buffering = False
        self.conserveBuffer = False

    def __enter__(self):
        self.file = open(self._filepath, 'w')
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.file.close()

    def write(self, prefix, event, value, previous):
        """Render one event and either buffer it or write it to the file.

        :returns: the text written to the file, or ``""`` when the text was
            only appended to the buffer.
        """
        res = self._write(prefix, event, value, previous)
        if self.buffering:
            self.buffer += res
            return ""
        self.file.write(res)
        return res

    @staticmethod
    def _quote(value):
        """Return *value* as a quoted, fully escaped JSON string."""
        # json.dumps escapes quotes, backslashes and control characters;
        # ensure_ascii=False leaves non-ASCII characters as they are.
        return json.dumps(u'{}'.format(value), ensure_ascii=False)

    def _token(self, event, value):
        """Return the bare JSON text for one event (no comma, no padding)."""
        if event in self._STRUCTURAL_TOKENS:
            return self._STRUCTURAL_TOKENS[event]
        if event == 'map_key':
            return self._quote(value) + ':'
        if event == 'string':
            return self._quote(value)
        if event == 'null' or value is None:
            # Formatting None directly would give the Python text "None".
            return 'null'
        if event == 'boolean':
            return str(value).lower()
        return '{}'.format(value)

    def _write(self, prefix, event, value, previous):
        """Build the JSON fragment for one event.

        :param prefix: dotted path of the event; its depth sets the number of
            indentation tabs in pretty mode.
        :param event: event name (``start_map``, ``map_key``, ``string`` ...).
        :param value: value attached to the event, if any.
        :param previous: name of the previously written event (``''`` at the
            start); decides whether a separating comma is needed.
        :returns: the fragment as a string.
        """
        token = self._token(event, value)
        if previous not in self._NO_COMMA_AFTER \
                and event not in self._CLOSING_EVENTS:
            token = ',' + token

        padding = ''
        # A value directly following its key stays on the key's line.
        if self.pretty and previous != 'map_key' and prefix:
            padding = '\t' * len(prefix.split('.'))

        # A key is never followed by a newline so its value can follow it.
        end = '\n' if self.pretty and event != 'map_key' else ''
        return padding + token + end

    def purgeBuffer(self):
        """Discard the buffered text and clear the conserve flag."""
        self.buffer = ""
        self.conserveBuffer = False

    def writeBuffer(self):
        """Write the buffered text to the file and return it.

        The buffer is purged afterwards, even if the write fails.
        """
        try:
            self.file.write(self.buffer)
            return self.buffer
        finally:
            self.purgeBuffer()

    def startBuffering(self):
        """Redirect subsequent :meth:`write` calls to the buffer."""
        self.buffering = True

    def endBuffering(self):
        """Stop buffering; flush the buffer if it was marked to be conserved.

        :returns: the flushed text, or ``""`` when the buffer was discarded.
        """
        self.buffering = False
        if self.conserveBuffer:
            self.conserveBuffer = False
            return self.writeBuffer()
        self.purgeBuffer()
        return ""

87 88 89 90 91 92 93 94 95 96 97 98 99 100
class Predicate(object):
    """A condition of the form ``comparator(candidate, value)`` tied to a field.

    A predicate built with ``comparator=None`` is unconditional: its
    :meth:`compare` always returns ``True``.
    """

    def __init__(self, field, comparator, value):
        """Store the field name, the comparison callable and the reference value.

        :param field: name of the field the predicate applies to.
        :param comparator: callable taking ``(candidate, value)``, or ``None``
            for an unconditional predicate.
        :param value: reference value handed to *comparator* as second argument.
        """
        self.comp = comparator
        self.field = field
        self.value = value

    def compare(self, field, other):
        """Test *other* against the stored value for the given *field*.

        :param field: name of the field *other* comes from.
        :param other: candidate value.
        :returns: ``True`` when no comparator is set; otherwise ``False`` if
            *field* differs from the predicate's field, else the result of
            ``comparator(other, value)``.  A comparator that raises an
            ordinary exception (for example on incomparable types) counts as
            a failed match and yields ``False``.
        """
        if self.comp is None:
            return True
        try:
            return field == self.field and self.comp(other, self.value)
        except Exception:
            # Best effort: an incomparable value simply does not match.
            # Only Exception is caught, so KeyboardInterrupt and SystemExit
            # still propagate.
            return False
101 102 103 104 105 106 107 108 109 110


class VidjilParser(object):
    """Stream a JSON file through ijson and copy selected parts to a writer.

    Parts are selected with :meth:`addPrefix`.  Every map whose ijson prefix
    equals a registered prefix (or that prefix followed by ``.item``) is
    buffered by the writer and only kept if one of the registered predicates
    matched while the writer's ``conserveBuffer`` flag was still unset.
    """

    def __init__(self, writer=None):
        """Create a parser.

        :param writer: object receiving the output; defaults to a
            ``VidjilWriter`` targeting ``out.json``.
        """
        if writer is not None:
            self._writer = writer
        else:
            self._writer = VidjilWriter('out.json')
        self._model_prefixes = []
        self.prefixes = []

    def initModel(self, model_path):
        """Record every distinct ``(prefix, event)`` pair of a model file.

        :param model_path: path of the JSON file used as a model.
        """
        known = set(self._model_prefixes)
        with open(model_path, 'r') as model:
            for prefix, event, _value in ijson.parse(model):
                pair = (prefix, event)
                if pair not in known:
                    known.add(pair)
                    self._model_prefixes.append(pair)

    def validate(self, filepath):
        """Check that a file contains every pair recorded by :meth:`initModel`.

        The whole file is always parsed, so malformed JSON still raises.

        :param filepath: path of the JSON file to check.
        :returns: ``True`` when every ``(prefix, event)`` pair of the model
            occurs in the file, ``False`` otherwise.
        """
        missing = set(self._model_prefixes)
        with open(filepath, 'r') as vfile:
            for prefix, event, _value in ijson.parse(vfile):
                missing.discard((prefix, event))
        return not missing

    def writer(self):
        """Return the writer used for output."""
        return self._writer

    def addPrefix(self, prefix, conditional=None, comp=None, value=None):
        """Register a prefix to extract, with an optional condition.

        :param prefix: ijson prefix of the part to extract.
        :param conditional: prefix of the field the condition is tested on.
        :param comp: callable ``comp(candidate, value)``; ``None`` makes the
            condition always true.
        :param value: reference value handed to *comp*.
        """
        self.prefixes.append((prefix, Predicate(conditional, comp, value)))

    def reset(self):
        """Forget all registered prefixes and empty the writer's buffer."""
        self.prefixes = []
        self._writer.purgeBuffer()

    def extract(self, filepath):
        """Extract the registered prefixes of *filepath* into the writer.

        :param filepath: path of the JSON file to read.
        :returns: the concatenation of everything written to the output.
        """
        # The input handle is closed as soon as the extraction is over.
        with open(filepath, 'r') as vidjilfile:
            parser = ijson.parse(vidjilfile)
            with self.writer() as writer:
                return self._extract(parser, writer)

    @staticmethod
    def _selects(names, prefix, value):
        """Tell whether an event at *prefix* belongs to the extracted output.

        An event is selected when it lies under a registered prefix, or when
        it lies on the path towards one.  In the latter case an event with a
        value is kept only if that value continues the path (a registered
        prefix starts with ``prefix + '.' + value`` or with ``value``).
        """
        if any(prefix.startswith(name) for name in names):
            return True
        if not any(name.startswith(prefix) for name in names):
            return False
        if value is None:
            return True
        text = str(value)
        dotted = prefix + '.' + text
        return any(name.startswith(dotted) or name.startswith(text)
                   for name in names)

    def _extract(self, parser, writer):
        """Feed the selected events of *parser* to *writer*.

        :param parser: iterable of ``(prefix, event, value)`` triples.
        :param writer: object whose ``write`` renders each selected event;
            buffering is driven on ``self._writer``, which :meth:`extract`
            passes as the very same object.
        :returns: the concatenation of the text returned by the writer.
        """
        # Invariants of the run, computed once rather than for every event.
        names = [item[0] for item in self.prefixes]
        predicates = [item[1] for item in self.prefixes]
        buffered_prefixes = set(names)
        buffered_prefixes.update(name + '.item' for name in names)

        previous = ''
        # Value of `previous` just before the buffered map currently open.
        resume_previous = previous
        chunks = []
        for prefix, event, value in parser:
            if not self._selects(names, prefix, value):
                continue

            if not self._writer.conserveBuffer \
                    and any(pred.compare(prefix, value) for pred in predicates):
                self._writer.conserveBuffer = True

            bufferOn = prefix in buffered_prefixes
            if bufferOn and event == "start_map":
                resume_previous = previous
                self._writer.startBuffering()

            chunks.append(writer.write(prefix, event, value, previous))
            previous = event

            if bufferOn and event == "end_map":
                kept = self._writer.conserveBuffer
                chunks.append(self._writer.endBuffering())
                if not kept:
                    # The whole map was discarded: for comma placement the
                    # next sibling must behave as if it had never existed.
                    previous = resume_previous
        return "".join(chunks)