Mentions légales du service

Skip to content
Snippets Groups Projects
Commit fd8e7b16 authored by Duc Cao
Browse files

Solve the general case of multiple tables per sheet. Fix failed test cases

parent 31c18dc1
No related branches found
No related tags found
No related merge requests found
...@@ -19,6 +19,9 @@ class BoundaryFinder(): ...@@ -19,6 +19,9 @@ class BoundaryFinder():
self.data_provider = data_provider self.data_provider = data_provider
self.dict_rows_signatures = dict() self.dict_rows_signatures = dict()
self.last_row_id = self.sheet.nrows - 1 self.last_row_id = self.sheet.nrows - 1
self.left_border = 0
self.empty_zones_post_extraction = list()
self.empty_zones_pre_extraction = list()
def __build_row_signatures(self, extractable_zone): def __build_row_signatures(self, extractable_zone):
dict_rows_signatures = dict() dict_rows_signatures = dict()
...@@ -82,14 +85,20 @@ class BoundaryFinder(): ...@@ -82,14 +85,20 @@ class BoundaryFinder():
return list_table_bounding_box return list_table_bounding_box
def __identify_extractable_zone(self): def __identify_extractable_zone(self):
DEFAULT_RIGHT_COLUMN = 2 logger.debug('__identify_extractable_zone: left_border=' + str(self.left_border))
DEFAULT_RIGHT_COLUMN = self.left_border + 2
if DEFAULT_RIGHT_COLUMN >= self.sheet.ncols:
return None, list(), list()
MIN_TABLE_HEIGHT = 3 MIN_TABLE_HEIGHT = 3
extractable_zones = list() extractable_zones = list()
left_column = 0 left_column = self.left_border
top_row_id = 0 top_row_id = 0
bottom_row_id = MIN_TABLE_HEIGHT bottom_row_id = MIN_TABLE_HEIGHT
right_column = DEFAULT_RIGHT_COLUMN right_column = DEFAULT_RIGHT_COLUMN
# TODO: self.sheet.ncols might not be the real number of columns in case of many empty columns
while right_column < self.sheet.ncols: while right_column < self.sheet.ncols:
if bottom_row_id >= self.sheet.nrows - 1: if bottom_row_id >= self.sheet.nrows - 1:
break break
...@@ -112,32 +121,47 @@ class BoundaryFinder(): ...@@ -112,32 +121,47 @@ class BoundaryFinder():
right_column = DEFAULT_RIGHT_COLUMN right_column = DEFAULT_RIGHT_COLUMN
top_row_id = row_id + 1 top_row_id = row_id + 1
bottom_row_id = top_row_id + MIN_TABLE_HEIGHT + 1 bottom_row_id = top_row_id + MIN_TABLE_HEIGHT + 1
left_column = 0 left_column = self.left_border
break break
right_column += 1 right_column += 1
# TODO after extraction: all cells from left-most border to the right border # before extraction: all cells from right border to the right
# before extraction: all cells from right border to right-most border list_empty_zones_pre_extraction = list()
if extractable_zones: # after extraction: all cells from left-most border to the right
list_empty_zones_post_extraction = list()
if extractable_zones and len(extractable_zones) > 1:
logger.debug('Found some ExtractableZones') logger.debug('Found some ExtractableZones')
list_right_borders = [extractable_zone.right for extractable_zone in extractable_zones] print right_column, self.sheet.ncols
right_most_border = max(list_right_borders) list_borders = [extractable_zone.right for extractable_zone in extractable_zones]
list_empty_zones = list() right_most_border = max(list_borders)
left_most_border = min(list_borders)
# print left_most_border, right_most_border
for extractable_zone in extractable_zones: for extractable_zone in extractable_zones:
logger.debug('Found an ' + str(extractable_zone))
if extractable_zone.right + 1 < right_most_border: if extractable_zone.right + 1 < right_most_border:
logger.debug('Found an ' + str(extractable_zone)) list_empty_zones_pre_extraction.append(EmptyZone(
list_empty_zones.append(EmptyZone(
extractable_zone.right + 1, right_most_border, extractable_zone.right + 1, right_most_border,
extractable_zone.top, extractable_zone.bottom extractable_zone.top, extractable_zone.bottom
)) ))
# logger.debug('Found an ' + str(list_empty_zones[-1])) logger.debug('list_empty_zones_pre_extraction ' + str(list_empty_zones_pre_extraction[-1]))
self.data_provider.set_empty_zones(list_empty_zones) if extractable_zone.right > left_most_border:
# logger.debug('Found an ' + str(extractable_zone))
list_empty_zones_post_extraction.append(EmptyZone(
left_most_border + 1, extractable_zone.right,
extractable_zone.top, extractable_zone.bottom
))
logger.debug('list_empty_zones_post_extraction ' + str(list_empty_zones_post_extraction[-1]))
if not self.empty_zones_post_extraction:
self.left_border = right_most_border + 1
# TODO scan for empty rows to find out the real number of rows # TODO scan for empty rows to find out the real number of rows
extractable_zone = ExtractableZone(left_column, right_most_border, 0, self.sheet.nrows - 1) extractable_zone = ExtractableZone(left_column, right_most_border, 0, self.sheet.nrows - 1)
else: else:
logger.debug('Found no ExtractableZones') logger.debug('Found no ExtractableZones')
extractable_zone = ExtractableZone(left_column, self.sheet.ncols - 1, 0, self.sheet.nrows - 1) extractable_zone = ExtractableZone(left_column, self.sheet.ncols - 1, 0, self.sheet.nrows - 1)
return extractable_zone self.left_border = self.sheet.ncols
return extractable_zone, list_empty_zones_post_extraction, list_empty_zones_pre_extraction
def __identify_extractable_zones(self): def __identify_extractable_zones(self):
extractable_zones = list() extractable_zones = list()
...@@ -166,51 +190,59 @@ class BoundaryFinder(): ...@@ -166,51 +190,59 @@ class BoundaryFinder():
return extractable_zones return extractable_zones
def find_list_table_boundary(self): def next_list_table_boundary(self):
""" Extract a list of TableBoundary from Sheet object """ Extract a list of TableBoundary from Sheet object
Returns: Returns:
list of TableBoundary objects list of TableBoundary objects
""" """
extractable_zones = [self.__identify_extractable_zone()] if self.empty_zones_post_extraction:
self.left_border = self.empty_zones_post_extraction[0].right
self.data_provider.set_empty_zones(self.empty_zones_post_extraction)
extractable_zone, self.empty_zones_post_extraction, self.empty_zones_pre_extraction = self.__identify_extractable_zone()
if not extractable_zone:
return None
all_table_boundary = list() all_table_boundary = list()
self.all_rows_signatures = dict() self.all_rows_signatures = dict()
for extractable_zone in extractable_zones: logger.debug('Process extractable_zone ' + str(extractable_zone))
logger.debug('Process extractable_zone ' + str(extractable_zone)) list_table_boundary = list()
list_table_boundary = list()
# TODO need different row signatures for different extractable_zone
self.dict_rows_signatures = self.__build_row_signatures(extractable_zone)
list_separations = self.__build_list_separations()
if list_separations:
list_table_bounding_box = self.__build_list_table_bounding_box(list_separations, extractable_zone)
for (previous_separation, separation, first_column_id, last_column_id) in list_table_bounding_box:
range_of_rows = range(previous_separation, separation + 1)
list_table_boundary.append(TableBoundary(
first_column_id,
# self.identify_first_column_of_data_cells(range_of_rows, first_column_id, last_column_id),
self.identify_first_column_of_data_cells(range_of_rows, extractable_zone.left, extractable_zone.right),
last_column_id,
range_of_rows
))
if not list_table_boundary: if self.empty_zones_pre_extraction:
range_of_rows = xrange(self.sheet.nrows) self.data_provider.set_empty_zones(self.empty_zones_pre_extraction)
first_column_id = self.find_first_column_id(extractable_zone, range_of_rows) # TODO need different row signatures for different extractable_zone
last_column_id = self.find_last_column_id(extractable_zone, range_of_rows) self.dict_rows_signatures = self.__build_row_signatures(extractable_zone)
list_separations = self.__build_list_separations()
if list_separations:
list_table_bounding_box = self.__build_list_table_bounding_box(list_separations, extractable_zone)
for (previous_separation, separation, first_column_id, last_column_id) in list_table_bounding_box:
range_of_rows = range(previous_separation, separation + 1)
list_table_boundary.append(TableBoundary( list_table_boundary.append(TableBoundary(
first_column_id, first_column_id,
# self.identify_first_column_of_data_cells(range_of_rows, first_column_id, last_column_id),
self.identify_first_column_of_data_cells(range_of_rows, extractable_zone.left, extractable_zone.right), self.identify_first_column_of_data_cells(range_of_rows, extractable_zone.left, extractable_zone.right),
last_column_id, last_column_id,
range_of_rows range_of_rows
)) ))
for table_boundary in list_table_boundary: if not list_table_boundary:
logger.debug('Found table_boundary: ' + str(table_boundary)) range_of_rows = xrange(self.sheet.nrows)
all_table_boundary.extend(list_table_boundary) first_column_id = self.find_first_column_id(extractable_zone, range_of_rows)
for table_boundary in list_table_boundary: last_column_id = self.find_last_column_id(extractable_zone, range_of_rows)
self.all_rows_signatures[table_boundary] = copy.deepcopy(self.dict_rows_signatures) list_table_boundary.append(TableBoundary(
first_column_id,
self.identify_first_column_of_data_cells(range_of_rows, extractable_zone.left, extractable_zone.right),
last_column_id,
range_of_rows
))
for table_boundary in list_table_boundary:
logger.debug('Found table_boundary: ' + str(table_boundary))
all_table_boundary.extend(list_table_boundary)
for table_boundary in list_table_boundary:
self.all_rows_signatures[table_boundary] = copy.deepcopy(self.dict_rows_signatures)
return all_table_boundary return all_table_boundary
......
...@@ -48,6 +48,7 @@ class HeaderZone(): ...@@ -48,6 +48,7 @@ class HeaderZone():
axis (string): 'x' for x axis, 'y' for y axis axis (string): 'x' for x axis, 'y' for y axis
""" """
self.headers = headers self.headers = headers
self.axis = axis
class DataZone(): class DataZone():
def __init__(self, data_rows, data_row_idx, first_row=None, last_row=None): def __init__(self, data_rows, data_row_idx, first_row=None, last_row=None):
......
...@@ -42,7 +42,9 @@ class DataProvider(): ...@@ -42,7 +42,9 @@ class DataProvider():
return Cell(xlrd.XL_CELL_EMPTY, '', self.sheet.cell(row, col).xf_index) return Cell(xlrd.XL_CELL_EMPTY, '', self.sheet.cell(row, col).xf_index)
def set_empty_zones(self, list_empty_zones): def set_empty_zones(self, list_empty_zones):
self.list_empty_zones = list_empty_zones for empty_zone in list_empty_zones:
print 'set_empty_zones', empty_zone
self.list_empty_zones.extend(list_empty_zones)
def get_row(self, row_id, extractable_zone=None): def get_row(self, row_id, extractable_zone=None):
cells = [self.cell(row_id, col_id) for col_id in xrange(self.sheet.ncols)] cells = [self.cell(row_id, col_id) for col_id in xrange(self.sheet.ncols)]
......
...@@ -66,18 +66,22 @@ class ExcelExtractor(): ...@@ -66,18 +66,22 @@ class ExcelExtractor():
data_provider = DataProvider(sheet) data_provider = DataProvider(sheet)
boundary_finder = BoundaryFinder(sheet, data_provider) boundary_finder = BoundaryFinder(sheet, data_provider)
list_table_boundary = boundary_finder.find_list_table_boundary()
results = list() results = list()
for table_boundary in list_table_boundary: list_table_boundary = boundary_finder.next_list_table_boundary()
table_extractor = TableExtractor( while list_table_boundary:
data_provider, for table_boundary in list_table_boundary:
table_boundary, table_extractor = TableExtractor(
self.variable_sheet, data_provider,
book.xf_list, table_boundary,
boundary_finder self.variable_sheet,
) book.xf_list,
results.append(table_extractor.extract()) boundary_finder
)
table = table_extractor.extract()
if table:
results.append(table)
list_table_boundary = boundary_finder.next_list_table_boundary()
return results return results
def extract_file(self, excel_file, sheet_idx=None): def extract_file(self, excel_file, sheet_idx=None):
......
...@@ -52,6 +52,8 @@ class TableExtractor(): ...@@ -52,6 +52,8 @@ class TableExtractor():
header_zone_x, header_zone_y, data_zone = HeaderZoneExtractor(self.xf_list, self.table_boundary, data_zone, header_zone_x, header_zone_y, data_zone = HeaderZoneExtractor(self.xf_list, self.table_boundary, data_zone,
self.data_provider, self.variable_sheet).extract() self.data_provider, self.variable_sheet).extract()
table = Table(header_zone_x, header_zone_y, data_zone, sheet_title, sheet_comment) table = Table(header_zone_x, header_zone_y, data_zone, sheet_title, sheet_comment)
else:
table = None
# return (header_rows, header_columns, data_rows, sheet_title) # return (header_rows, header_columns, data_rows, sheet_title)
return table return table
......
...@@ -8,10 +8,14 @@ from extractor import ExcelExtractor ...@@ -8,10 +8,14 @@ from extractor import ExcelExtractor
def template_test(file_name, expected_header_rows, expected_header_columns, expected_data_rows): def template_test(file_name, expected_header_rows, expected_header_columns, expected_data_rows):
extractor = ExcelExtractor() extractor = ExcelExtractor()
table = extractor.extract_file(file_name, sheet_idx=[0])[0] tables = extractor.extract_file(file_name, sheet_idx=[0])
header_rows = table.header_zone_x.headers if tables:
header_columns = table.header_zone_y.headers table = tables[0]
data_rows = table.data_zone.data_rows header_rows = table.header_zone_x.headers
header_columns = table.header_zone_y.headers
data_rows = table.data_zone.data_rows
else:
header_rows, header_columns, data_rows = [], [], []
assert data_rows == expected_data_rows assert data_rows == expected_data_rows
assert header_rows == map(list, zip(*expected_header_rows)) assert header_rows == map(list, zip(*expected_header_rows))
assert header_columns == map(list, zip(*expected_header_columns)) assert header_columns == map(list, zip(*expected_header_columns))
......
...@@ -22,6 +22,10 @@ def test_multiple_tables0(): ...@@ -22,6 +22,10 @@ def test_multiple_tables0():
( (
[[u'AZ']], [[u'AZ']],
[[u'Consommation interm\xe9diaire', u'P2'], [u'Valeur ajout\xe9e brute', u'B1g'], [u'PRODUCTION DES BRANCHES', u'P1'], [u'Production marchande', u'P11'], [u'Prod. pour emploi final propre', u'P12'], [u'Production non marchande', u'P13']] [[u'Consommation interm\xe9diaire', u'P2'], [u'Valeur ajout\xe9e brute', u'B1g'], [u'PRODUCTION DES BRANCHES', u'P1'], [u'Production marchande', u'P11'], [u'Prod. pour emploi final propre', u'P12'], [u'Production non marchande', u'P13']]
),
(
[[u'AZ'], [u'TOTAL']],
[[u'Agriculture, sylviculture et p\xeache', u'AZ'], [u'Industries extractives, \xe9nergie, eau, gestion des d\xe9chets et d\xe9pollution', u'DE'], [u'Correction CAF/FAB', u'PCAFAB'], [u'TOTAL', u'TOTAL']]
) )
] ]
) )
...@@ -36,6 +40,10 @@ def test_multiple_tables2(): ...@@ -36,6 +40,10 @@ def test_multiple_tables2():
( (
[[u'Production des produits (1)'], [u'Importations de biens'], [u'Importations de services'], [u'TOTAL DES RESSOURCES (3)']], [[u'Production des produits (1)'], [u'Importations de biens'], [u'Importations de services'], [u'TOTAL DES RESSOURCES (3)']],
[[u'AZ'], [u'DE'], [u'PCAFAB'], [u'TOTAL']] [[u'AZ'], [u'DE'], [u'PCAFAB'], [u'TOTAL']]
),
(
[[u'AZ'], [u'TOTAL']],
[[u'Agriculture, sylviculture et p\xeache', u'AZ'], [u'Industries extractives, \xe9nergie, eau, gestion des d\xe9chets et d\xe9pollution', u'DE'], [u'Correction CAF/FAB', u'PCAFAB'], [u'TOTAL', u'TOTAL']]
) )
] ]
) )
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment