diff --git a/boundary_finder.py b/boundary_finder.py index 972a5352d158cbfc053573ad43026589090b5460..d3779f072eee211f49ddf0cce369af1ce6dbfd8f 100644 --- a/boundary_finder.py +++ b/boundary_finder.py @@ -88,7 +88,7 @@ class BoundaryFinder(): logger.debug('__identify_extractable_zone: left_border=' + str(self.left_border)) DEFAULT_RIGHT_COLUMN = self.left_border + 2 - if DEFAULT_RIGHT_COLUMN >= self.sheet.ncols: + if DEFAULT_RIGHT_COLUMN >= self.data_provider.get_col_count(): return None, list(), list() MIN_TABLE_HEIGHT = 3 @@ -98,20 +98,20 @@ class BoundaryFinder(): bottom_row_id = MIN_TABLE_HEIGHT right_column = DEFAULT_RIGHT_COLUMN - # TODO: self.sheet.ncols might not be the real number of columns in case of many empty columns - while right_column < self.sheet.ncols: + while right_column < self.data_provider.get_col_count(): if bottom_row_id >= self.sheet.nrows - 1: break - for row_id in xrange(bottom_row_id, self.sheet.nrows): - column = self.data_provider.get_col(right_column)[top_row_id:row_id + 1] - bottom_row = self.data_provider.get_row(row_id)[left_column:right_column + 1] + big_column = self.data_provider.get_col(right_column) + for row_id in xrange(bottom_row_id, self.data_provider.get_row_count()): + column = big_column[top_row_id:row_id + 1] + bottom_row = self.data_provider.get_row(row_id, left_index=left_column, right_index=right_column + 1) if not self.data_provider.row_contains_merge_cells(row_id, left_column, right_column + 1) and \ (row_id == self.sheet.nrows - 1 or ExcelValidator.is_empty(bottom_row)): right_border = None if ExcelValidator.is_empty(column): if not self.data_provider.column_contains_merged_cells(right_column, top_row_id, row_id + 1): right_border = right_column - 1 - elif right_column == self.sheet.ncols - 1: + elif right_column == self.data_provider.get_col_count() - 1: right_border = right_column if right_border: @@ -131,7 +131,6 @@ class BoundaryFinder(): list_empty_zones_post_extraction = list() if extractable_zones and len(extractable_zones) > 1: logger.debug('Found some ExtractableZones') - print right_column, self.sheet.ncols list_borders = [extractable_zone.right for extractable_zone in extractable_zones] right_most_border = max(list_borders) left_most_border = min(list_borders) @@ -154,12 +153,11 @@ class BoundaryFinder(): if not self.empty_zones_post_extraction: self.left_border = right_most_border + 1 - # TODO scan for empty rows to find out the real number of rows - extractable_zone = ExtractableZone(left_column, right_most_border, 0, self.sheet.nrows - 1) + extractable_zone = ExtractableZone(left_column, right_most_border, 0, self.data_provider.get_row_count() - 1) else: logger.debug('Found no ExtractableZones') - extractable_zone = ExtractableZone(left_column, self.sheet.ncols - 1, 0, self.sheet.nrows - 1) - self.left_border = self.sheet.ncols + extractable_zone = ExtractableZone(left_column, self.data_provider.get_col_count() - 1, 0, self.data_provider.get_row_count() - 1) + self.left_border = self.data_provider.get_col_count() return extractable_zone, list_empty_zones_post_extraction, list_empty_zones_pre_extraction @@ -167,9 +165,9 @@ class BoundaryFinder(): extractable_zones = list() left_column = 0 - for right_column in xrange(1, self.sheet.ncols): + for right_column in xrange(1, self.data_provider.get_col_count()): column = self.data_provider.get_col(right_column) - if (right_column == self.sheet.ncols - 1 or ExcelValidator.is_empty(column)) \ + if (right_column == self.data_provider.get_col_count() - 1 or ExcelValidator.is_empty(column)) \ and right_column - left_column >= 2: is_empty_zone = True diff --git a/data_provider.py b/data_provider.py index 9ee96645d8a25752a3fe338bfb6ee95a978bcc04..a9ac8490ed7f9316bbb0bf391a2ae76dc16376b6 100644 --- a/data_provider.py +++ b/data_provider.py @@ -2,6 +2,7 @@ import xlrd from xlrd.sheet import Cell import utils +from config import * from data_model import EmptyZone from excel_utils import MISSING_VALUE_NOTATIONS, ExcelValidator @@ -11,6 +12,8 @@ class DataProvider(): self.sheet = sheet self.dict_merged_cells = self.build_dict_merged_cells() self.list_empty_zones = list() + self.row_count = None + self.col_count = None @classmethod def data_cell_value(cls, cell): @@ -46,18 +49,44 @@ class DataProvider(): print 'set_empty_zones', empty_zone self.list_empty_zones.extend(list_empty_zones) - def get_row(self, row_id, extractable_zone=None): - cells = [self.cell(row_id, col_id) for col_id in xrange(self.sheet.ncols)] + def get_row(self, row_id, extractable_zone=None, left_index=None, right_index=None): + if left_index and right_index: + cells = [self.cell(row_id, col_id) for col_id in xrange(left_index, min(self.get_col_count(), right_index + 1))] + else: + cells = [self.cell(row_id, col_id) for col_id in xrange(self.get_col_count())] if extractable_zone: return cells[extractable_zone.left: extractable_zone.right] else: return cells def get_col(self, col_id): - return [self.cell(row_id, col_id) for row_id in xrange(self.sheet.nrows)] - - def get_rows_count(self): - return self.sheet.nrows + return [self.cell(row_id, col_id) for row_id in xrange(self.get_row_count())] + + def get_row_count(self): + if self.row_count is None: + self.row_count = self.sheet.nrows + num_continuous_empty_rows = 0 + for rx in range(self.sheet.nrows): + if ExcelValidator.is_empty(self.sheet.row(rx)): + num_continuous_empty_rows += 1 + if num_continuous_empty_rows >= MAX_NUM_OF_EMPTY_ROWS: + self.row_count = rx - MAX_NUM_OF_EMPTY_ROWS + break + else: + num_continuous_empty_rows = 0 + return self.row_count + + def get_col_count(self): + if self.col_count is None: + right_most_empty_col_id = self.sheet.ncols - 1 + for col_id in reversed(xrange(self.sheet.ncols)): + if ExcelValidator.is_empty(self.sheet.col(col_id)): + right_most_empty_col_id = col_id + else: + break + self.col_count = right_most_empty_col_id + 1 + + return self.col_count def cell(self, row, col): if self.belongs_to_empy_zones(row, col): diff --git a/log_utils.py b/log_utils.py index db028923782847c24ffe3e2841e3f60081b3866f..b0114b97d0539785540a0eebb0f6b68dabb294ca 100644 --- a/log_utils.py +++ b/log_utils.py @@ -5,7 +5,7 @@ def get_log(file_name): log_name = file_name[:file_name.index('.')] logger = logging.getLogger(log_name) - logging.basicConfig(level=logging.INFO) + logging.basicConfig(level=logging.DEBUG) handler = logging.FileHandler(file_name) formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) diff --git a/table_extractor.py b/table_extractor.py index 1a9c92a80335a08273a3530c1b3c6e07e469cdd3..c9d5492f9f2aca4935a7fdbcffc95817d33d5752 100644 --- a/table_extractor.py +++ b/table_extractor.py @@ -36,7 +36,7 @@ class TableExtractor(): table = Table() - if self.data_provider.get_rows_count() > 0: + if self.data_provider.get_row_count() > 0: # TODO: extract title sheet_title = self.data_provider.get_merged_cell(0, 0).value diff --git a/test_data/test_multiple_tables2b.xls b/test_data/test_multiple_tables2b.xls index 8ee4463aef150d48e8495e20bb42bf53a1ff400d..f982596d01f0a31635ab3202c3cdb23091120201 100644 Binary files a/test_data/test_multiple_tables2b.xls and b/test_data/test_multiple_tables2b.xls differ