Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 275392ee authored by Duc Cao's avatar Duc Cao
Browse files

Speed up Extractor by eliminating the empty rows/columns in BoundaryFinder

parent fd8e7b16
No related branches found
No related tags found
No related merge requests found
...@@ -88,7 +88,7 @@ class BoundaryFinder(): ...@@ -88,7 +88,7 @@ class BoundaryFinder():
logger.debug('__identify_extractable_zone: left_border=' + str(self.left_border)) logger.debug('__identify_extractable_zone: left_border=' + str(self.left_border))
DEFAULT_RIGHT_COLUMN = self.left_border + 2 DEFAULT_RIGHT_COLUMN = self.left_border + 2
if DEFAULT_RIGHT_COLUMN >= self.sheet.ncols: if DEFAULT_RIGHT_COLUMN >= self.data_provider.get_col_count():
return None, list(), list() return None, list(), list()
MIN_TABLE_HEIGHT = 3 MIN_TABLE_HEIGHT = 3
...@@ -98,20 +98,20 @@ class BoundaryFinder(): ...@@ -98,20 +98,20 @@ class BoundaryFinder():
bottom_row_id = MIN_TABLE_HEIGHT bottom_row_id = MIN_TABLE_HEIGHT
right_column = DEFAULT_RIGHT_COLUMN right_column = DEFAULT_RIGHT_COLUMN
# TODO: self.sheet.ncols might not be the real number of columns in case of many empty columns while right_column < self.data_provider.get_col_count():
while right_column < self.sheet.ncols:
if bottom_row_id >= self.sheet.nrows - 1: if bottom_row_id >= self.sheet.nrows - 1:
break break
for row_id in xrange(bottom_row_id, self.sheet.nrows): big_column = self.data_provider.get_col(right_column)
column = self.data_provider.get_col(right_column)[top_row_id:row_id + 1] for row_id in xrange(bottom_row_id, self.data_provider.get_row_count()):
bottom_row = self.data_provider.get_row(row_id)[left_column:right_column + 1] column = big_column[top_row_id:row_id + 1]
bottom_row = self.data_provider.get_row(row_id, left_index=left_column, right_index=right_column + 1)
if not self.data_provider.row_contains_merge_cells(row_id, left_column, right_column + 1) and \ if not self.data_provider.row_contains_merge_cells(row_id, left_column, right_column + 1) and \
(row_id == self.sheet.nrows - 1 or ExcelValidator.is_empty(bottom_row)): (row_id == self.sheet.nrows - 1 or ExcelValidator.is_empty(bottom_row)):
right_border = None right_border = None
if ExcelValidator.is_empty(column): if ExcelValidator.is_empty(column):
if not self.data_provider.column_contains_merged_cells(right_column, top_row_id, row_id + 1): if not self.data_provider.column_contains_merged_cells(right_column, top_row_id, row_id + 1):
right_border = right_column - 1 right_border = right_column - 1
elif right_column == self.sheet.ncols - 1: elif right_column == self.data_provider.get_col_count() - 1:
right_border = right_column right_border = right_column
if right_border: if right_border:
...@@ -131,7 +131,6 @@ class BoundaryFinder(): ...@@ -131,7 +131,6 @@ class BoundaryFinder():
list_empty_zones_post_extraction = list() list_empty_zones_post_extraction = list()
if extractable_zones and len(extractable_zones) > 1: if extractable_zones and len(extractable_zones) > 1:
logger.debug('Found some ExtractableZones') logger.debug('Found some ExtractableZones')
print right_column, self.sheet.ncols
list_borders = [extractable_zone.right for extractable_zone in extractable_zones] list_borders = [extractable_zone.right for extractable_zone in extractable_zones]
right_most_border = max(list_borders) right_most_border = max(list_borders)
left_most_border = min(list_borders) left_most_border = min(list_borders)
...@@ -154,12 +153,11 @@ class BoundaryFinder(): ...@@ -154,12 +153,11 @@ class BoundaryFinder():
if not self.empty_zones_post_extraction: if not self.empty_zones_post_extraction:
self.left_border = right_most_border + 1 self.left_border = right_most_border + 1
# TODO scan for empty rows to find out the real number of rows extractable_zone = ExtractableZone(left_column, right_most_border, 0, self.data_provider.get_row_count() - 1)
extractable_zone = ExtractableZone(left_column, right_most_border, 0, self.sheet.nrows - 1)
else: else:
logger.debug('Found no ExtractableZones') logger.debug('Found no ExtractableZones')
extractable_zone = ExtractableZone(left_column, self.sheet.ncols - 1, 0, self.sheet.nrows - 1) extractable_zone = ExtractableZone(left_column, self.data_provider.get_col_count() - 1, 0, self.data_provider.get_row_count() - 1)
self.left_border = self.sheet.ncols self.left_border = self.data_provider.get_col_count()
return extractable_zone, list_empty_zones_post_extraction, list_empty_zones_pre_extraction return extractable_zone, list_empty_zones_post_extraction, list_empty_zones_pre_extraction
...@@ -167,9 +165,9 @@ class BoundaryFinder(): ...@@ -167,9 +165,9 @@ class BoundaryFinder():
extractable_zones = list() extractable_zones = list()
left_column = 0 left_column = 0
for right_column in xrange(1, self.sheet.ncols): for right_column in xrange(1, self.data_provider.get_col_count()):
column = self.data_provider.get_col(right_column) column = self.data_provider.get_col(right_column)
if (right_column == self.sheet.ncols - 1 or ExcelValidator.is_empty(column)) \ if (right_column == self.data_provider.get_col_count() - 1 or ExcelValidator.is_empty(column)) \
and right_column - left_column >= 2: and right_column - left_column >= 2:
is_empty_zone = True is_empty_zone = True
......
...@@ -2,6 +2,7 @@ import xlrd ...@@ -2,6 +2,7 @@ import xlrd
from xlrd.sheet import Cell from xlrd.sheet import Cell
import utils import utils
from config import *
from data_model import EmptyZone from data_model import EmptyZone
from excel_utils import MISSING_VALUE_NOTATIONS, ExcelValidator from excel_utils import MISSING_VALUE_NOTATIONS, ExcelValidator
...@@ -11,6 +12,8 @@ class DataProvider(): ...@@ -11,6 +12,8 @@ class DataProvider():
self.sheet = sheet self.sheet = sheet
self.dict_merged_cells = self.build_dict_merged_cells() self.dict_merged_cells = self.build_dict_merged_cells()
self.list_empty_zones = list() self.list_empty_zones = list()
self.row_count = None
self.col_count = None
@classmethod @classmethod
def data_cell_value(cls, cell): def data_cell_value(cls, cell):
...@@ -46,18 +49,44 @@ class DataProvider(): ...@@ -46,18 +49,44 @@ class DataProvider():
print 'set_empty_zones', empty_zone print 'set_empty_zones', empty_zone
self.list_empty_zones.extend(list_empty_zones) self.list_empty_zones.extend(list_empty_zones)
def get_row(self, row_id, extractable_zone=None): def get_row(self, row_id, extractable_zone=None, left_index=None, right_index=None):
cells = [self.cell(row_id, col_id) for col_id in xrange(self.sheet.ncols)] if left_index and right_index:
cells = [self.cell(row_id, col_id) for col_id in xrange(left_index, min(self.get_col_count(), right_index + 1))]
else:
cells = [self.cell(row_id, col_id) for col_id in xrange(self.get_col_count())]
if extractable_zone: if extractable_zone:
return cells[extractable_zone.left: extractable_zone.right] return cells[extractable_zone.left: extractable_zone.right]
else: else:
return cells return cells
def get_col(self, col_id): def get_col(self, col_id):
return [self.cell(row_id, col_id) for row_id in xrange(self.sheet.nrows)] return [self.cell(row_id, col_id) for row_id in xrange(self.get_row_count())]
def get_rows_count(self): def get_row_count(self):
return self.sheet.nrows if self.row_count is None:
self.row_count = self.sheet.nrows
num_continuous_empty_rows = 0
for rx in range(self.sheet.nrows):
if ExcelValidator.is_empty(self.sheet.row(rx)):
num_continuous_empty_rows += 1
if num_continuous_empty_rows >= MAX_NUM_OF_EMPTY_ROWS:
self.row_count = rx - MAX_NUM_OF_EMPTY_ROWS
break
else:
num_continuous_empty_rows = 0
return self.row_count
def get_col_count(self):
    """Return the effective column count of the sheet, cached after first use.

    Columns are scanned from the right edge of ``self.sheet`` towards the
    left. Each trailing column that ``ExcelValidator.is_empty`` reports as
    empty moves the boundary one step left; the scan stops at the first
    non-empty column. The result is the boundary index plus one, so the
    left-most column of a trailing empty run is still counted.
    """
    # Reuse the value computed by an earlier call.
    if self.col_count is not None:
        return self.col_count

    total_cols = self.sheet.ncols
    boundary = total_cols - 1
    # Walk right-to-left until a column with content is found.
    for idx in xrange(total_cols - 1, -1, -1):
        if not ExcelValidator.is_empty(self.sheet.col(idx)):
            break
        boundary = idx
    self.col_count = boundary + 1
    return self.col_count
def cell(self, row, col): def cell(self, row, col):
if self.belongs_to_empy_zones(row, col): if self.belongs_to_empy_zones(row, col):
......
...@@ -5,7 +5,7 @@ def get_log(file_name): ...@@ -5,7 +5,7 @@ def get_log(file_name):
log_name = file_name[:file_name.index('.')] log_name = file_name[:file_name.index('.')]
logger = logging.getLogger(log_name) logger = logging.getLogger(log_name)
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.DEBUG)
handler = logging.FileHandler(file_name) handler = logging.FileHandler(file_name)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter) handler.setFormatter(formatter)
......
...@@ -36,7 +36,7 @@ class TableExtractor(): ...@@ -36,7 +36,7 @@ class TableExtractor():
table = Table() table = Table()
if self.data_provider.get_rows_count() > 0: if self.data_provider.get_row_count() > 0:
# TODO: extract title # TODO: extract title
sheet_title = self.data_provider.get_merged_cell(0, 0).value sheet_title = self.data_provider.get_merged_cell(0, 0).value
......
No preview for this file type
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment