Commit c2000847 authored by Marko Kuder

prevent some CSV false positives

parent 0676ee9d
......@@ -221,6 +221,9 @@ def _is_spreadsheet(table_set, format, log):
return 0
return float(num_cells) / float(num_rows)
num_cells = num_rows = 0
# messytables sometimes seems to treat the newline as a delimiter, so a regular text
# file is parsed as a 2-column CSV whose last column is always empty - if this happens,
# ignore that one column
last_row_always_empty = True
try:
table = table_set.tables[0]
# Iterate through the table.sample (sample because otherwise
......@@ -229,16 +232,21 @@ def _is_spreadsheet(table_set, format, log):
if row:
# Must have enough cells
num_cells += len(row)
if not row[-1].empty:
last_row_always_empty = False
num_rows += 1
if num_cells > 20 or num_rows > 10:
cells_per_row = get_cells_per_row(num_cells, num_rows)
# over the long term, 2 columns is the minimum
if cells_per_row > 1.9:
if cells_per_row > 1.9 and not last_row_always_empty or cells_per_row > 2.9:
log.info('Is %s because %.1f cells per row (%i cells, %i rows)', \
format,
get_cells_per_row(num_cells, num_rows),
num_cells, num_rows)
return True
else: #file is long, but there is not a lot of columns
return False
finally:
pass
# if file is short then be more lenient
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment