twneale · December 14, 2015 15:49
diff --git a/gistfile1.py b/gistfile1.py
 class PlaintextColumns(object):
     '''
     Parse plain text columns like this into a table:

     cols = """
         Austin             Errington      Lawson, L        Pryor
         Bartlett           Forestal       Macer            Riecken
         Battles            GiaQuinta      Moed             Shackleford
         Bauer              Goodin         Moseley          Smith, V
         Brown,C            Hale           Niezgodsk        Stemler
         Candelaria Reardon Harris         Pelath           Summers
         DeLaney            Kersey         Pierce           VanDenburgh
         Dvorak             Klinker        Porter
     """

     Column boundaries are guessed from runs of two or more whitespace
     characters, so a single space inside a cell ('Lawson, L') does not
     split it. A cell that is empty or missing from a row is reported
     as None.

     Usage:

     >>> table = PlaintextColumns(cols)

     >>> next(table.rows())
     ('Austin', 'Errington', 'Lawson, L', 'Pryor')

     >>> next(table.cols())

     ('Austin',
      'Bartlett',
      'Battles',
      'Bauer',
      'Brown,C',
      'Candelaria Reardon',
      'DeLaney',
      'Dvorak')

     >>> list(table.cells())
     ['Austin', 'Errington', 'Lawson, L', ...]
     '''
     def __init__(self, text, threshold=5):
         '''Store the text to parse and the outlier threshold.

         text is the block of column-aligned text; leading and trailing
         whitespace is stripped from it.

         threshold is how many times a column boundary (an integer offset
         from the beginning of the left-stripped line) must be found in
         order to qualify as a boundary and not an outlier.
         '''
         self.text = text.strip()
         self.threshold = threshold

     def _get_column_ends(self):
         '''Guess where the ends of the columns lie.

         Returns a collections.Counter mapping each offset at which a run
         of two or more whitespace characters ends to the number of times
         that offset was seen. Offsets are measured on the left-stripped
         line.
         '''
         ends = collections.Counter()
         for line in self.text.splitlines():
             for matchobj in re.finditer(r'\s{2,}', line.lstrip()):
                 ends[matchobj.end()] += 1
         return ends

     def _get_column_boundaries(self):
         '''Use the guessed ends to guess the boundaries of the plain
         text columns.

         Returns a list of slice objects, one per column, to be applied
         to a left-stripped line.
         '''
         ends = self._get_column_ends()
         if not ends:
             # If there aren't even any nontrivial sequences of whitespace
             # dividing text, there may be just one column. In which case,
             # return a single span, effectively the whole line.
             return [slice(None, None)]

         # Keep only the offsets seen often enough not to be outliers.
         threshold = self.threshold
         boundaries = sorted(
             end for end, count in ends.items() if count >= threshold)
         if not boundaries:
             # No offset reached the threshold (e.g. there are only a few
             # rows), so just use every apparent boundary. Potentially a
             # source of inaccuracy.
             boundaries = sorted(ends)

         # Convert the boundaries into a list of span slices: each column
         # starts where the previous one ended, the first one at 0.
         starts = [0] + boundaries[:-1]
         spans = [slice(start, stop)
                  for start, stop in zip(starts, boundaries)]

         # And get from the last boundary to the line ending.
         spans.append(slice(boundaries[-1], None))
         return spans

     @property
     def boundaries(self):
         '''The list of column slices, computed on first access and then
         cached on the instance as _boundaries.
         '''
         _boundaries = getattr(self, '_boundaries', None)
         if _boundaries is None:
             self._boundaries = _boundaries = self._get_column_boundaries()
         return _boundaries

     def getcells(self, line):
         '''Using self.boundaries, extract cells from the given line.

         Yields one value per column: the stripped cell text, or None
         when that span of the line is empty.
         '''
         # Boundaries are offsets into the left-stripped line.
         stripped = line.lstrip()
         for boundary in self.boundaries:
             yield stripped[boundary].strip() or None

     def rows(self):
         '''Returns an iterator of row tuples.
         '''
         for line in self.text.splitlines():
             yield tuple(self.getcells(line))

     def cells(self):
         '''Returns an iterator of all cells in the table, row by row.
         '''
         for line in self.text.splitlines():
             for cell in self.getcells(line):
                 yield cell

     def cols(self):
         '''Returns an iterator of column tuples.
         '''
         # itertools.izip only exists on Python 2; on Python 3 the builtin
         # zip is already lazy.
         lazy_zip = getattr(itertools, 'izip', zip)
         return lazy_zip(*self.rows())

     __iter__ = cells
	class PlaintextColumns(object):
	    '''
	    Parse plain text columns like this into a table:

	    cols = """
	        Austin             Errington      Lawson, L        Pryor
	        Bartlett           Forestal       Macer            Riecken
	        Battles            GiaQuinta      Moed             Shackleford
	        Bauer              Goodin         Moseley          Smith, V
	        Brown,C            Hale           Niezgodsk        Stemler
	        Candelaria Reardon Harris         Pelath           Summers
	        DeLaney            Kersey         Pierce           VanDenburgh
	        Dvorak             Klinker        Porter
	    """

	    Column boundaries are guessed from runs of two or more whitespace
	    characters, so a single space inside a cell ('Lawson, L') does not
	    split it. A cell that is empty or missing from a row is reported
	    as None.

	    Usage:

	    >>> table = PlaintextColumns(cols)

	    >>> next(table.rows())
	    ('Austin', 'Errington', 'Lawson, L', 'Pryor')

	    >>> next(table.cols())

	    ('Austin',
	     'Bartlett',
	     'Battles',
	     'Bauer',
	     'Brown,C',
	     'Candelaria Reardon',
	     'DeLaney',
	     'Dvorak')

	    >>> list(table.cells())
	    ['Austin', 'Errington', 'Lawson, L', ...]
	    '''
	    def __init__(self, text, threshold=5):
	        '''Store the text to parse and the outlier threshold.

	        text is the block of column-aligned text; leading and trailing
	        whitespace is stripped from it.

	        threshold is how many times a column boundary (an integer offset
	        from the beginning of the left-stripped line) must be found in
	        order to qualify as a boundary and not an outlier.
	        '''
	        self.text = text.strip()
	        self.threshold = threshold

	    def _get_column_ends(self):
	        '''Guess where the ends of the columns lie.

	        Returns a collections.Counter mapping each offset at which a run
	        of two or more whitespace characters ends to the number of times
	        that offset was seen. Offsets are measured on the left-stripped
	        line.
	        '''
	        ends = collections.Counter()
	        for line in self.text.splitlines():
	            for matchobj in re.finditer(r'\s{2,}', line.lstrip()):
	                ends[matchobj.end()] += 1
	        return ends

	    def _get_column_boundaries(self):
	        '''Use the guessed ends to guess the boundaries of the plain
	        text columns.

	        Returns a list of slice objects, one per column, to be applied
	        to a left-stripped line.
	        '''
	        ends = self._get_column_ends()
	        if not ends:
	            # If there aren't even any nontrivial sequences of whitespace
	            # dividing text, there may be just one column. In which case,
	            # return a single span, effectively the whole line.
	            return [slice(None, None)]

	        # Keep only the offsets seen often enough not to be outliers.
	        threshold = self.threshold
	        boundaries = sorted(
	            end for end, count in ends.items() if count >= threshold)
	        if not boundaries:
	            # No offset reached the threshold (e.g. there are only a few
	            # rows), so just use every apparent boundary. Potentially a
	            # source of inaccuracy.
	            boundaries = sorted(ends)

	        # Convert the boundaries into a list of span slices: each column
	        # starts where the previous one ended, the first one at 0.
	        starts = [0] + boundaries[:-1]
	        spans = [slice(start, stop)
	                 for start, stop in zip(starts, boundaries)]

	        # And get from the last boundary to the line ending.
	        spans.append(slice(boundaries[-1], None))
	        return spans

	    @property
	    def boundaries(self):
	        '''The list of column slices, computed on first access and then
	        cached on the instance as _boundaries.
	        '''
	        _boundaries = getattr(self, '_boundaries', None)
	        if _boundaries is None:
	            self._boundaries = _boundaries = self._get_column_boundaries()
	        return _boundaries

	    def getcells(self, line):
	        '''Using self.boundaries, extract cells from the given line.

	        Yields one value per column: the stripped cell text, or None
	        when that span of the line is empty.
	        '''
	        # Boundaries are offsets into the left-stripped line.
	        stripped = line.lstrip()
	        for boundary in self.boundaries:
	            yield stripped[boundary].strip() or None

	    def rows(self):
	        '''Returns an iterator of row tuples.
	        '''
	        for line in self.text.splitlines():
	            yield tuple(self.getcells(line))

	    def cells(self):
	        '''Returns an iterator of all cells in the table, row by row.
	        '''
	        for line in self.text.splitlines():
	            for cell in self.getcells(line):
	                yield cell

	    def cols(self):
	        '''Returns an iterator of column tuples.
	        '''
	        # itertools.izip only exists on Python 2; on Python 3 the builtin
	        # zip is already lazy.
	        lazy_zip = getattr(itertools, 'izip', zip)
	        return lazy_zip(*self.rows())

	    __iter__ = cells