Handle the general case of multiple tables per sheet and fix the failing test cases.

fd8e7b16 · Duc Cao · 31c18dc1 · fd8e7b16 · fd8e7b16 · fd8e7b16
Commit fd8e7b16 authored 8 years ago by Duc Cao
--- a/boundary_finder.py
+++ b/boundary_finder.py
@@ -19,6 +19,9 @@ class BoundaryFinder():
 		self.data_provider = data_provider
 		self.dict_rows_signatures = dict()
 		self.last_row_id = self.sheet.nrows - 1
+		self.left_border = 0
+		self.empty_zones_post_extraction = list()
+		self.empty_zones_pre_extraction = list()
 	def __build_row_signatures(self, extractable_zone):
 		dict_rows_signatures = dict()
@@ -82,14 +85,20 @@ class BoundaryFinder():
 		return list_table_bounding_box
 	def __identify_extractable_zone(self):
-		DEFAULT_RIGHT_COLUMN = 2
+		logger.debug('__identify_extractable_zone: left_border=' + str(self.left_border))
+		DEFAULT_RIGHT_COLUMN = self.left_border + 2
+		if DEFAULT_RIGHT_COLUMN >= self.sheet.ncols:
+			return None, list(), list()
 		MIN_TABLE_HEIGHT = 3
 		extractable_zones = list()
-		left_column = 0
+		left_column = self.left_border
 		top_row_id = 0
 		bottom_row_id = MIN_TABLE_HEIGHT
 		right_column = DEFAULT_RIGHT_COLUMN
+		# TODO: self.sheet.ncols might not be the real number of columns in case of many empty columns
 		while right_column < self.sheet.ncols:
 			if bottom_row_id >= self.sheet.nrows - 1:
 				break
@@ -112,32 +121,47 @@ class BoundaryFinder():
 						right_column = DEFAULT_RIGHT_COLUMN
 						top_row_id = row_id + 1
 						bottom_row_id = top_row_id + MIN_TABLE_HEIGHT + 1
-						left_column = 0
+						left_column = self.left_border
 						break
 			right_column += 1
-		# TODO after extraction: all cells from left-most border to the right border
+		# before extraction: all cells from right border to the right
-		# before extraction: all cells from right border to right-most border
+		list_empty_zones_pre_extraction = list()
-		if extractable_zones:
+		# after extraction: all cells from left-most border to the right
+		list_empty_zones_post_extraction = list()
+		if extractable_zones and len(extractable_zones) > 1:
 			logger.debug('Found some ExtractableZones')
-			list_right_borders = [extractable_zone.right for extractable_zone in extractable_zones]
+			print right_column, self.sheet.ncols
-			right_most_border = max(list_right_borders)
+			list_borders = [extractable_zone.right for extractable_zone in extractable_zones]
-			list_empty_zones = list()
+			right_most_border = max(list_borders)
+			left_most_border = min(list_borders)
+			# print left_most_border, right_most_border
 			for extractable_zone in extractable_zones:
+				logger.debug('Found an ' + str(extractable_zone))
 				if extractable_zone.right + 1 < right_most_border:
-					logger.debug('Found an ' + str(extractable_zone))
+					list_empty_zones_pre_extraction.append(EmptyZone(
-					list_empty_zones.append(EmptyZone(
 						extractable_zone.right + 1, right_most_border,
 						extractable_zone.top, extractable_zone.bottom
 					))
-					# logger.debug('Found an ' + str(list_empty_zones[-1]))
+					logger.debug('list_empty_zones_pre_extraction ' + str(list_empty_zones_pre_extraction[-1]))
-			self.data_provider.set_empty_zones(list_empty_zones)
+				if extractable_zone.right > left_most_border:
+					# logger.debug('Found an ' + str(extractable_zone))
+					list_empty_zones_post_extraction.append(EmptyZone(
+						left_most_border + 1, extractable_zone.right,
+						extractable_zone.top, extractable_zone.bottom
+					))
+					logger.debug('list_empty_zones_post_extraction ' + str(list_empty_zones_post_extraction[-1]))
+			if not self.empty_zones_post_extraction:
+				self.left_border = right_most_border + 1
 			# TODO scan for empty rows to find out the real number of rows
 			extractable_zone = ExtractableZone(left_column, right_most_border, 0, self.sheet.nrows - 1)
 		else:
 			logger.debug('Found no ExtractableZones')
 			extractable_zone = ExtractableZone(left_column, self.sheet.ncols - 1, 0, self.sheet.nrows - 1)
-		return extractable_zone
+			self.left_border = self.sheet.ncols
+		return extractable_zone, list_empty_zones_post_extraction, list_empty_zones_pre_extraction
 	def __identify_extractable_zones(self):
 		extractable_zones = list()
@@ -166,51 +190,59 @@ class BoundaryFinder():
 		return extractable_zones
-	def find_list_table_boundary(self):
+	def next_list_table_boundary(self):
 		""" Extract a list of TableBoundary from Sheet object
 		Returns:
 			list of TableBoundary objects
 		"""
-		extractable_zones = [self.__identify_extractable_zone()]
+		if self.empty_zones_post_extraction:
+			self.left_border = self.empty_zones_post_extraction[0].right
+			self.data_provider.set_empty_zones(self.empty_zones_post_extraction)
+		extractable_zone, self.empty_zones_post_extraction, self.empty_zones_pre_extraction = self.__identify_extractable_zone()
+		if not extractable_zone:
+			return None
 		all_table_boundary = list()
 		self.all_rows_signatures = dict()
-		for extractable_zone in extractable_zones:
+		logger.debug('Process extractable_zone ' + str(extractable_zone))
-			logger.debug('Process extractable_zone ' + str(extractable_zone))
+		list_table_boundary = list()
-			list_table_boundary = list()
-			# TODO need different row signatures for different extractable_zone
-			self.dict_rows_signatures = self.__build_row_signatures(extractable_zone)
-			list_separations = self.__build_list_separations()
-			if list_separations:
-				list_table_bounding_box = self.__build_list_table_bounding_box(list_separations, extractable_zone)
-				for (previous_separation, separation, first_column_id, last_column_id) in list_table_bounding_box:
-					range_of_rows = range(previous_separation, separation + 1)
-					list_table_boundary.append(TableBoundary(
-						first_column_id,
-						# self.identify_first_column_of_data_cells(range_of_rows, first_column_id, last_column_id),
-						self.identify_first_column_of_data_cells(range_of_rows, extractable_zone.left, extractable_zone.right),
-						last_column_id,
-						range_of_rows
-					))
-			if not list_table_boundary:
+		if self.empty_zones_pre_extraction:
-				range_of_rows = xrange(self.sheet.nrows)
+			self.data_provider.set_empty_zones(self.empty_zones_pre_extraction)
-				first_column_id = self.find_first_column_id(extractable_zone, range_of_rows)
+		# TODO need different row signatures for different extractable_zone
-				last_column_id = self.find_last_column_id(extractable_zone, range_of_rows)
+		self.dict_rows_signatures = self.__build_row_signatures(extractable_zone)
+		list_separations = self.__build_list_separations()
+		if list_separations:
+			list_table_bounding_box = self.__build_list_table_bounding_box(list_separations, extractable_zone)
+			for (previous_separation, separation, first_column_id, last_column_id) in list_table_bounding_box:
+				range_of_rows = range(previous_separation, separation + 1)
 				list_table_boundary.append(TableBoundary(
 					first_column_id,
+					# self.identify_first_column_of_data_cells(range_of_rows, first_column_id, last_column_id),
 					self.identify_first_column_of_data_cells(range_of_rows, extractable_zone.left, extractable_zone.right),
 					last_column_id,
 					range_of_rows
 				))
-			for table_boundary in list_table_boundary:
+		if not list_table_boundary:
-				logger.debug('Found table_boundary: ' + str(table_boundary))
+			range_of_rows = xrange(self.sheet.nrows)
-			all_table_boundary.extend(list_table_boundary)
+			first_column_id = self.find_first_column_id(extractable_zone, range_of_rows)
-			for table_boundary in list_table_boundary:
+			last_column_id = self.find_last_column_id(extractable_zone, range_of_rows)
-				self.all_rows_signatures[table_boundary] = copy.deepcopy(self.dict_rows_signatures)
+			list_table_boundary.append(TableBoundary(
+				first_column_id,
+				self.identify_first_column_of_data_cells(range_of_rows, extractable_zone.left, extractable_zone.right),
+				last_column_id,
+				range_of_rows
+			))
+		for table_boundary in list_table_boundary:
+			logger.debug('Found table_boundary: ' + str(table_boundary))
+		all_table_boundary.extend(list_table_boundary)
+		for table_boundary in list_table_boundary:
+			self.all_rows_signatures[table_boundary] = copy.deepcopy(self.dict_rows_signatures)
 		return all_table_boundary

--- a/data_model.py
+++ b/data_model.py
@@ -48,6 +48,7 @@ class HeaderZone():
 		axis (string): 'x' for x axis, 'y' for y axis
 		"""
 		self.headers = headers
+		self.axis = axis
 class DataZone():
 	def __init__(self, data_rows, data_row_idx, first_row=None, last_row=None):

--- a/data_provider.py
+++ b/data_provider.py
@@ -42,7 +42,9 @@ class DataProvider():
 		return Cell(xlrd.XL_CELL_EMPTY, '', self.sheet.cell(row, col).xf_index)
 	def set_empty_zones(self, list_empty_zones):
-		self.list_empty_zones = list_empty_zones
+		for empty_zone in list_empty_zones:
+			print 'set_empty_zones', empty_zone
+		self.list_empty_zones.extend(list_empty_zones)
 	def get_row(self, row_id, extractable_zone=None):
 		cells = [self.cell(row_id, col_id) for col_id in xrange(self.sheet.ncols)]

--- a/extractor.py
+++ b/extractor.py
@@ -66,18 +66,22 @@ class ExcelExtractor():
 		data_provider = DataProvider(sheet)
 		boundary_finder = BoundaryFinder(sheet, data_provider)
-		list_table_boundary = boundary_finder.find_list_table_boundary()
 		results = list()
-		for table_boundary in list_table_boundary:
+		list_table_boundary = boundary_finder.next_list_table_boundary()
-			table_extractor = TableExtractor(
+		while list_table_boundary:
-				data_provider,
+			for table_boundary in list_table_boundary:
-				table_boundary,
+				table_extractor = TableExtractor(
-				self.variable_sheet,
+					data_provider,
-				book.xf_list,
+					table_boundary,
-				boundary_finder
+					self.variable_sheet,
-			)
+					book.xf_list,
-			results.append(table_extractor.extract())
+					boundary_finder
+				)
+				table = table_extractor.extract()
+				if table:
+					results.append(table)
+			list_table_boundary = boundary_finder.next_list_table_boundary()
 		return results
 	def extract_file(self, excel_file, sheet_idx=None):

--- a/table_extractor.py
+++ b/table_extractor.py
@@ -52,6 +52,8 @@ class TableExtractor():
 				header_zone_x, header_zone_y, data_zone = HeaderZoneExtractor(self.xf_list, self.table_boundary, data_zone,
 					self.data_provider, self.variable_sheet).extract()
 				table = Table(header_zone_x, header_zone_y, data_zone, sheet_title, sheet_comment)
+			else:
+				table = None
 		# return (header_rows, header_columns, data_rows, sheet_title)
 		return table

--- a/test_extractor.py
+++ b/test_extractor.py
@@ -8,10 +8,14 @@ from extractor import ExcelExtractor
 def template_test(file_name, expected_header_rows, expected_header_columns, expected_data_rows):
 	extractor = ExcelExtractor()
-	table = extractor.extract_file(file_name, sheet_idx=[0])[0]
+	tables = extractor.extract_file(file_name, sheet_idx=[0])
-	header_rows = table.header_zone_x.headers
+	if tables:
-	header_columns = table.header_zone_y.headers
+		table = tables[0]
-	data_rows = table.data_zone.data_rows
+		header_rows = table.header_zone_x.headers
+		header_columns = table.header_zone_y.headers
+		data_rows = table.data_zone.data_rows
+	else:
+		header_rows, header_columns, data_rows = [], [], []
 	assert data_rows == expected_data_rows
 	assert header_rows == map(list, zip(*expected_header_rows))
 	assert header_columns == map(list, zip(*expected_header_columns))

--- a/test_extractor_multiple_tables2.py
+++ b/test_extractor_multiple_tables2.py
@@ -22,6 +22,10 @@ def test_multiple_tables0():
 			(
 				[[u'AZ']],
 				[[u'Consommation interm\xe9diaire', u'P2'], [u'Valeur ajout\xe9e brute', u'B1g'], [u'PRODUCTION DES BRANCHES', u'P1'], [u'Production marchande', u'P11'], [u'Prod. pour emploi final propre', u'P12'], [u'Production non marchande', u'P13']]
+			),
+			(
+				[[u'AZ'], [u'TOTAL']],
+				[[u'Agriculture, sylviculture et p\xeache', u'AZ'], [u'Industries extractives, \xe9nergie, eau, gestion des d\xe9chets et d\xe9pollution', u'DE'], [u'Correction CAF/FAB', u'PCAFAB'], [u'TOTAL', u'TOTAL']]
 			)
 		]
 	)
@@ -36,6 +40,10 @@ def test_multiple_tables2():
 			(
 				[[u'Production des produits (1)'], [u'Importations de biens'], [u'Importations de services'], [u'TOTAL DES RESSOURCES (3)']],
 				[[u'AZ'], [u'DE'], [u'PCAFAB'], [u'TOTAL']]
+			),
+			(
+				[[u'AZ'], [u'TOTAL']],
+				[[u'Agriculture, sylviculture et p\xeache', u'AZ'], [u'Industries extractives, \xe9nergie, eau, gestion des d\xe9chets et d\xe9pollution', u'DE'], [u'Correction CAF/FAB', u'PCAFAB'], [u'TOTAL', u'TOTAL']]
 			)
 		]
 	)