nischalshrestha · April 5, 2019 21:34
diff --git a/pandas_parser.py b/pandas_parser.py
 # dependencies:
 # - Python3.6
 # - Lark: pip install lark-parser

 import sys
 import os.path
 from lark import Lark, Transformer, Visitor
 from lark import Tree
 # https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark

 # Lark grammar for a small subset of pandas indexing on a frame named ``df``:
 #   * rows  - ``df[0]`` / ``df[0:1]``          (single position or slice)
 #   * cols  - ``df[['a', 'b']]``               (list of quoted labels)
 #   * iloc  - ``df.iloc[rows]`` / ``df.iloc[rows, cols]``
 # Rule names starting with ``_`` are inlined by Lark and never appear as
 # nodes in the parse tree.  ``start_idx`` / ``end_idx`` take at most one
 # NUMBER (``?``): a slice bound is either present once or omitted.
 pandas_grammar = """
    start: subset+                                  -> exprs
    data: "df"                                      -> df
    subset: data (rows | cols | iloc)

    rows: "[" _index "]"
    cols: "[[" label ("," label)* "]]"
    iloc: "." "iloc" (rows | _rows_cols)

    _rows_cols: "[" left ("," right?)? "]"
    left: _index
    right: _index

    _index: range | NUMBER
    range: start_idx ":" end_idx
    start_idx: NUMBER?
    end_idx: NUMBER?
    label: "'" WORD "'"

    %import common.LETTER
    %import common.INT -> NUMBER
    %import common.WORD
    %import common.WS
    %ignore WS
 """
 # Sample inputs: one whitespace-separated batch of expressions per indexing
 # form (plain row indexing, column-label lists, and ``.iloc`` indexing).
 rows = """
        df[0] df[0:1] df[10:100] df[:1] df[1:]
        """
 cols = """
        df[['a']] df[['aaa']] df[['a', 'b', 'c']] df[['aaa', 'bbb', 'ccc']]
        """
 iloc_rows = """
            df.iloc[1] df.iloc[1, ] df.iloc[:] df.iloc[0:] df.iloc[:1]
            df.iloc[0:1] df.iloc[0:1, 0:1] df.iloc[0:, 0:] df.iloc[:1, :1]
            """

 # ``parser`` is the instance used for all parsing below; ``parser_ast`` is a
 # second instance built with Lark's default options.
 parser = Lark(pandas_grammar, keep_all_tokens=False)
 parser_ast = Lark(pandas_grammar)

 # Parse each batch and pretty-print its tree under a heading line.
 for _heading, _sample in (('rows', rows), ('cols', cols), ('iloc', iloc_rows)):
    print('\n' + _heading)
    print(parser.parse(_sample).pretty())

 class LeftRightVisitor(Visitor):
    """Record the contents of ``left`` and ``right`` nodes while visiting.

    After ``visit`` has run, ``left_tree`` / ``right_tree`` hold the first
    child of the last ``left`` / ``right`` node that was visited, or remain
    ``None`` if no such node was seen.
    """

    left_tree = None
    right_tree = None

    def _remember(self, attr, node):
        # Keep the node's first child, then echo the node for debugging.
        setattr(self, attr, node.children[0])
        print(node.data, node)

    def left(self, tree):
        """Callback for ``left`` nodes."""
        self._remember('left_tree', tree)

    def right(self, tree):
        """Callback for ``right`` nodes."""
        self._remember('right_tree', tree)
    
 # Demo input: an iloc expression with an open-ended slice before the comma
 # and nothing after it.
 text = "df.iloc[:,]"
 parse_tree = parser.parse(text)
 print(parse_tree.pretty())
 # Visit the tree so the visitor stores the children of any ``left`` /
 # ``right`` nodes; the code further down reads them from ``visitor``.
 visitor = LeftRightVisitor()
 visitor.visit(parse_tree)

 # Separator between the parse output above and the translated output below.
 print('-----')

 class IndexTranslator(Transformer):
    """Rewrite the bounds of a parsed ``range`` node as strings.

    The start bound is shifted by one (0-based to 1-based) and a missing
    bound is replaced by a default: ``'1'`` for the start, and
    ``'nrow(df)'`` / ``'ncol(df)'`` for the end depending on ``side``
    (presumably targeting R-style indexing; confirm against the caller).

    Args:
        side: ``'left'`` for the row index, ``'right'`` for the column index.
        *args, **kwargs: forwarded unchanged to ``Transformer``.
    """

    def __init__(self, side, *args, **kwargs):
        # Let the base class initialise itself instead of silently dropping
        # whatever extra arguments the caller supplied.
        super().__init__(*args, **kwargs)
        self.side = side

    def start_idx(self, matches):
        """Return the start bound plus one, or ``'1'`` when it is not a single number."""
        if len(matches) != 1:
            return '1'
        return str(int(matches[0]) + 1)

    def end_idx(self, matches):
        """Return the explicit end bound, or the side's full extent when omitted.

        Raises:
            ValueError: the end is omitted and ``side`` is neither ``'left'``
                nor ``'right'`` (previously this returned ``None`` silently).
        """
        if len(matches) == 1:
            return matches[0]
        if self.side == 'left':
            return 'nrow(df)'
        if self.side == 'right':
            return 'ncol(df)'
        raise ValueError(
            "side must be 'left' or 'right', got {!r}".format(self.side))

 # TODO put this into a function for handling iloc (maybe it can handle regular ones too)
 def _translate_index(node, side):
    """Return the translated index text for one side of an iloc expression.

    ``node`` is what the visitor captured for that side: ``None`` when the
    side is absent, a ``range`` tree, or a bare NUMBER token.
    """
    if node is None:
        return ''
    if not isinstance(node, Tree):
        # Bare NUMBER token (e.g. ``df.iloc[1, ]``): it has no children to
        # join, so shift the single position the same way start bounds are.
        return str(int(node) + 1)
    translated = IndexTranslator(side).transform(node)
    return translated.children[0] + ':' + translated.children[1]


 new_idx = ''
 new_left_idx = _translate_index(visitor.left_tree, 'left')
 new_right_idx = _translate_index(visitor.right_tree, 'right')
 print("df[" + new_left_idx + "," + new_right_idx + "]")
	# dependencies:
	# - Python3.6
	# - Lark: pip install lark-parser

	import sys
	import os.path
	from lark import Lark, Transformer, Visitor
	from lark import Tree
	# https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark

	# Lark grammar for a small subset of pandas indexing on a frame named ``df``:
	# row indexing (``df[0:1]``), column-label lists (``df[['a', 'b']]``) and
	# ``df.iloc[...]``.  Alternatives use a plain ``|`` (the backslash-escaped
	# form is not valid grammar syntax), and each slice bound takes at most one
	# NUMBER (``?``).
	pandas_grammar = """
	start: subset+ -> exprs
	data: "df" -> df
	subset: data (rows | cols | iloc)

	rows: "[" _index "]"
	cols: "[[" label ("," label)* "]]"
	iloc: "." "iloc" (rows | _rows_cols)

	_rows_cols: "[" left ("," right?)? "]"
	left: _index
	right: _index

	_index: range | NUMBER
	range: start_idx ":" end_idx
	start_idx: NUMBER?
	end_idx: NUMBER?
	label: "'" WORD "'"

	%import common.LETTER
	%import common.INT -> NUMBER
	%import common.WORD
	%import common.WS
	%ignore WS
	"""
	# Sample inputs: one whitespace-separated batch of expressions per indexing
	# form (plain row indexing, column-label lists, and ``.iloc`` indexing).
	rows = """
	df[0] df[0:1] df[10:100] df[:1] df[1:]
	"""
	cols = """
	df[['a']] df[['aaa']] df[['a', 'b', 'c']] df[['aaa', 'bbb', 'ccc']]
	"""
	iloc_rows = """
	df.iloc[1] df.iloc[1, ] df.iloc[:] df.iloc[0:] df.iloc[:1]
	df.iloc[0:1] df.iloc[0:1, 0:1] df.iloc[0:, 0:] df.iloc[:1, :1]
	"""

	# ``parser`` is the instance used for all parsing below; ``parser_ast`` is a
	# second instance built with Lark's default options.
	parser = Lark(pandas_grammar, keep_all_tokens=False)
	parser_ast = Lark(pandas_grammar)

	# Parse each batch and pretty-print its tree under a heading line.
	for _heading, _sample in (('rows', rows), ('cols', cols), ('iloc', iloc_rows)):
	    print('\n' + _heading)
	    print(parser.parse(_sample).pretty())

	class LeftRightVisitor(Visitor):
	    """Record the contents of ``left`` and ``right`` nodes while visiting.

	    After ``visit`` has run, ``left_tree`` / ``right_tree`` hold the first
	    child of the last ``left`` / ``right`` node that was visited, or remain
	    ``None`` if no such node was seen.
	    """

	    left_tree = None
	    right_tree = None

	    def left(self, tree):
	        """Callback for ``left`` nodes: keep the first child and echo the node."""
	        self.left_tree = tree.children[0]
	        print(tree.data, tree)

	    def right(self, tree):
	        """Callback for ``right`` nodes: keep the first child and echo the node."""
	        self.right_tree = tree.children[0]
	        print(tree.data, tree)

	# Demo input: an iloc expression with an open-ended slice before the comma
	# and nothing after it.
	text = "df.iloc[:,]"
	parse_tree = parser.parse(text)
	print(parse_tree.pretty())
	# Visit the tree so the visitor stores the children of any ``left`` /
	# ``right`` nodes; the code further down reads them from ``visitor``.
	visitor = LeftRightVisitor()
	visitor.visit(parse_tree)

	# Separator between the parse output above and the translated output below.
	print('-----')

	class IndexTranslator(Transformer):
	    """Rewrite the bounds of a parsed ``range`` node as strings.

	    The start bound is shifted by one (0-based to 1-based) and a missing
	    bound is replaced by a default: ``'1'`` for the start, and
	    ``'nrow(df)'`` / ``'ncol(df)'`` for the end depending on ``side``
	    (presumably targeting R-style indexing; confirm against the caller).

	    Args:
	        side: ``'left'`` for the row index, ``'right'`` for the column index.
	        *args, **kwargs: forwarded unchanged to ``Transformer``.
	    """

	    def __init__(self, side, *args, **kwargs):
	        # Variadic extras are optional, so ``IndexTranslator('left')`` works,
	        # and they are forwarded so the base class can initialise itself.
	        super().__init__(*args, **kwargs)
	        self.side = side

	    def start_idx(self, matches):
	        """Return the start bound plus one, or ``'1'`` when it is not a single number."""
	        if len(matches) != 1:
	            return '1'
	        return str(int(matches[0]) + 1)

	    def end_idx(self, matches):
	        """Return the explicit end bound, or the side's full extent when omitted.

	        Raises:
	            ValueError: the end is omitted and ``side`` is neither ``'left'``
	                nor ``'right'`` (previously this returned ``None`` silently).
	        """
	        if len(matches) == 1:
	            return matches[0]
	        if self.side == 'left':
	            return 'nrow(df)'
	        if self.side == 'right':
	            return 'ncol(df)'
	        raise ValueError(
	            "side must be 'left' or 'right', got {!r}".format(self.side))

	# TODO put this into a function for handling iloc (maybe it can handle regular ones too)
	def _translate_index(node, side):
	    """Return the translated index text for one side of an iloc expression.

	    ``node`` is what the visitor captured for that side: ``None`` when the
	    side is absent, a ``range`` tree, or a bare NUMBER token.
	    """
	    if node is None:
	        return ''
	    if not isinstance(node, Tree):
	        # Bare NUMBER token (e.g. ``df.iloc[1, ]``): it has no children to
	        # join, so shift the single position the same way start bounds are.
	        return str(int(node) + 1)
	    translated = IndexTranslator(side).transform(node)
	    return translated.children[0] + ':' + translated.children[1]


	new_idx = ''
	new_left_idx = _translate_index(visitor.left_tree, 'left')
	new_right_idx = _translate_index(visitor.right_tree, 'right')
	print("df[" + new_left_idx + "," + new_right_idx + "]")