Skip to content

Instantly share code, notes, and snippets.

@jarble
Last active August 11, 2019 17:41
Show Gist options
  • Save jarble/3eb4abaea6c7b4cbcf4bdafc5a6403fa to your computer and use it in GitHub Desktop.
Save jarble/3eb4abaea6c7b4cbcf4bdafc5a6403fa to your computer and use it in GitHub Desktop.
This Python script generates a "pseudocode parser" for the Nearley parser generator. It's based on an adaptive parsing algorithm or "adaptive grammar."
import copy
import sys
import subprocess
def matches_pattern(string,pattern):
if len(string) != len(pattern):
return False
for i in range(0,len(string)):
if is_expr_(pattern[i]):
for j in range(0,len(string)):
if pattern[i] == pattern[j] and string[i] != string[j]:
return False
elif string[i] != pattern[i]:
return False
return True
def rewrite_rule(input_string,input_pattern,output_pattern):
output_string = copy.deepcopy(output_pattern)
if not matches_pattern(input_string,input_pattern):
return False
for i in range(0, len(input_pattern)):
if input_pattern[i][0].islower() and input_pattern[i] != input_string[i]:
return False
else:
for j in range(0,len(output_pattern)):
if output_pattern[j] == input_pattern[i]:
output_string[j] = input_string[i]
return output_string
def replace_matching_subsequence(input_string,input_pattern,output_pattern):
output_string = copy.deepcopy(input_string)
for i in range(0,len(input_string)-len(input_pattern)+1):
output_substring = rewrite_rule(input_string[i:len(input_pattern)+i],input_pattern,output_pattern)
if output_substring != False:
if type_of(output_substring) != None:
output_string[i : i+len(input_pattern)] = [output_substring]
else:
output_string[i : i+len(input_pattern)] = output_substring
return output_string
return False
def matches_patterns(a,patterns):
for pattern in patterns:
if matches_pattern(a,pattern):
return True
return False
types = {}
def simplify_pattern(pattern):
if type(pattern) == list:
return pattern
output = []
for current in pattern.split(" "):
if current == ":number":
output += [is_number]
elif current[0] == ":" and (current[1:len(current)] in ["bool","string","set","list","imperative"]):
output += [is_type_pattern(current[1:len(current)])]
elif current == ":expr":
output += [is_expr]
else:
output += [current]
return output
def simplify_patterns(patterns):
return [simplify_pattern(x) for x in [b.strip() for b in patterns.strip().split("\n")]]
def is_expr_(a):
if type(a) == str and (a[0].isupper()):
if not a in types:
types[a] = "expr"
return True
return False
def is_expr(a):
return type_of(a) != None
def matches_type_pattern(a,pattern):
if(len(a) != len(pattern)):
return False
elif type(a) == str and a == pattern:
return True
for i in range(0,len(a)):
if(type(pattern[i]) == str and pattern[i] != a[i] or callable(pattern[i]) and not pattern[i](a[i])):
return False
return True
global type_patterns
type_patterns = {}
def matches_type_patterns(a,patterns):
for pattern in patterns:
if matches_type_pattern(a,pattern):
return True
return False
def is_number(a):
global type_patterns
if (type(a) == str and (a.isdigit() or a[0].isupper())) or matches_type_patterns(a,type_patterns["number"]):
if(type(a) == str):
types[a] = "number"
return True
else:
return False
def is_type_pattern(the_type):
def to_return(a):
global type_patterns
if (
is_expr_(a)
or matches_type_patterns(a,type_patterns[the_type])
):
if(type(a) == str):
types[a] = the_type
return True
else:
return False
return to_return
type_patterns["list"] = simplify_patterns('''
:expr in :list
[ :expr for :expr in :list ]
[ :expr for :expr in :list if :bool ]
type of :expr
dot product of :list and :list
cross product of :list and :list
determinant of :list
largest number in :list
smallest number in :list
list ( itertools . permutations ( :list ) )
:list [ :: -1 ]
sorted ( :list )
:list + :list
''')
type_patterns["number"] = simplify_patterns('''
- :number
Math . PI
:number % :number
:number + :number
:number - :number
:number * :number
:number / :number
:number ** :number
maximize :number subject to :bool
absolute value of :number
sqrt ( :number )
cbrt :number
diff ( :number , :number )
integral of :number with respect to :number
integral of :number
integral of :number from :number to :number
integral of :number with respect to :number from :number to :number
greatest common factor of :number and :number
all solutions of :number
sin ( :number )
cos ( :number )
tan ( :number )
ln :number
limit of :number as :number -> :number
domain of the function :number
range of the function :number
a random number between :number and :number
distance from :list to :list
''')
type_patterns["bool"] = simplify_patterns('''
:number == :number
forall :expr ( :bool )
:bool == :bool
:list == :list
:set == :set
:number != :number
:bool != :bool
:list != :list
:set != :set
:number > :number
:number < :number
:number >= :number
:number <= :number
:number is a number
:number is a string
:number is a list
:bool or :bool
:bool and :bool
issubsetof ( :set , :set )
there exists :expr such that :bool
true
false
''')
type_patterns["set"] = simplify_patterns('''
intersection of :set and :set
union of :set and :set''')
type_patterns["string"] = simplify_patterns('''
substring of :string from :number to :number
longest match of the regex :string in the string :string
:string is a substring of :string
:string matches the regex :string
:replace :string in the string :string with :string''')
type_patterns["imperative"] = simplify_patterns('''
console . log ( :expr ) ;
while ( :bool ) { :imperative }
var :expr = :expr ;
:expr = :expr ;
input ( :string ) ;
return :expr ;
if :bool then :imperative
if :bool then :imperative else :imperative
''')
def pattern_to_string(pattern):
str1 = ""
num = 0
for i in pattern:
if(callable(i)):
num += 1
str1 += " A" + str(num)
else:
str1 += " " + i
return str1 + " --> " + str1
rewrite_rule_string = ""
for current in type_patterns.values():
for i in current:
rewrite_rule_string += "\n" + pattern_to_string(i)
global rewrite_rules
global synonyms
synonyms = []
rewrite_rules = [[list(filter(None,i.split(" "))),list(filter(None,j.split(" ")))] for [i,j] in [a.split("-->") for a in [b.strip() for b in
(rewrite_rule_string+'''
first A letters of B --> substring of B from 0 to A
A is a substring of B --> A is a substring of B
A is a superset of B --> A is a subset of B
A contains B --> B is in A
the string A contains the string B --> B is a substring of A
A is more than B --> A > B
A and B --> A and B
A squared --> A to the power of 2
A cubed --> A to the power of 2
although --> though
greater --> more
but --> and
A though B --> A but B
A is greater than or equal to B --> A >= B
A is not equal to B --> A != B
A is less than or equal to B --> A <= B
A divided by B --> A / B
A is less than B --> B > A
A is C than B and D than E --> ( A is C than B ) and ( A is D than E )
( A is C than B ) and D than E --> ( A is C than B ) and ( A is D than E )
A minus B --> A - B
sum of A and B --> A plus B
A is not more than B --> A <= B
A is not less than B --> A >= B
A is not one of B --> A is not in B
A is not in B --> (A is in B) == false
( A ) --> A
∞ --> infinity
sum of A and B --> A + B
A must be B --> A == B
A must not be B --> A != B
A cannot be B --> A != B
the A --> A
A is not B --> ( A != B )
A is neither B nor C --> ( A is not B ) and ( A is not C )
A is not a B --> ( A is a B ) == false
A multiplied by B --> A * B
A to the power of B --> A ** B
[ A for A in B where D ] --> [ A for A in B if D ]
[ A in B where D ] --> [ A for A in B where D ]
every number in L that is greater than B --> [ a in B where a > B ]
all permutations of X --> list ( itertools . permutations ( X ) )
every permutation of X --> all permutations of X
product of A and B --> A * B
A mod B --> A % B
A is divisible by B --> ( A % B ) == 0
A is not X by Y --> ( A is X by Y ) == false
A is not a number --> A is not a number
if A then B otherwise C --> if A then B else C
not A --> ( A == false )
unless A then B otherwise C --> if ( not A ) then B else C
A is a match of the regex B --> A matches the regex B
the string A matches the regex B --> A is a match of the regex B
A are in B --> A is a subset of B
plus --> +
A is not a B of C --> ( A is a B of C ) == false
none of A are in B --> A is not a subset of B
derivative of A with respect to B --> diff ( A , B )
A percent of B --> A * 0.10 * B
A does not equal B --> A != B
A ≠ B --> A != B
A if B --> if B then A
B implies A --> A if B
A unless B --> if ( not A ) then B
distance between A and B --> distance from A to B
sin A --> sin ( A )
cos A --> cos ( A )
tan A --> tan ( A )
sine of A --> sin A
cosine of A --> cos A
tangent of A --> tan A
cotangent of A --> 1 / ( tan A )
secant of A --> 1 / ( cos A )
secant of A --> 1 / ( cos A )
A = B --> A == B
=/= --> !=
A ÷ B --> A / B
A % of B --> A percent of B
quotient of A and B --> A / B
A iff B --> ( A implies B ) and ( B implies A )
⇔ --> iff
[ A | B <- C ] --> [ A for B in C ]
| A | --> absolute value of A
square root of A --> sqrt ( A )
cube root of A --> cbrt A
^ --> **
one --> 1
two --> 2
three --> 3
four --> 4
five --> 5
six --> 6
seven --> 7
eight --> 8
nine --> 9
ten --> 10
minimize A subject to B --> maximize A subject to - ( B )
maximize A so that B --> maximize A subject to B
maximize A such that B --> maximize A so that B
maximize A where B --> maximize A so that B
A ∪ B --> union of A and B
A ∩ B --> intersection of A and B
A || B --> A or B
A && B --> A and B
larger --> greater
largest --> greatest
maximum --> greatest
minimum --> smallest
natural logarithm of A --> ln A
regex --> regular expression
¬ A --> not A
A ≥ B --> A >= B
A ≤ B --> A <= B
A ⊃ B --> A is a superset of B
A ⊂ B --> A is a subset of B
A is a subset of B --> issubsetof ( A , B )
iff --> if and only if
A is implied by B --> B implies A
∀ A ( B ) --> forall A ( B )
↔ --> iff
A if B --> if B then A
A ∨ B --> A or B
∃ A ( B ) --> there exists A such that B
if ( A ) { B } --> if A then B
A is any of B --> A is in B
A is in B --> A in B
A equals B --> A == B
A times B --> A * B
A is a factor of B --> B is divisible by A
longest substring of A that matches the regex B --> longest match of the regex B in the string A
A is B --> A == B
A reversed --> A [ :: -1 ]
A spelled backwards --> A reversed
A sorted --> sorted ( A )
A in alphabetical order --> sorted ( A )
until A { B } --> while ( A == false ) { B }
A while B --> while ( B ) { A }
! A --> A == false
A := B --> A = B ;
set A to B --> A := B
initialize A to B --> var A = B ;
A += B --> A := A + B
A -= B --> A := A - B
A *= B --> A := A * B
A /= B --> A := A / B
add A to B --> B += A
divide B by A --> B /= A
subtract B from A --> A -= B
multiply A by B --> A *= B
let A = B --> initialize A to B
let A be B --> let A = B
A until B --> until B { A }
print A --> console . log ( A ) ;
π --> Math . PI
pi --> π
if A { B } --> if A then B
while A { B } --> while ( A ) { B }
while A do B end --> while A { B }
if A then B end --> if A { B }
A <- 1 --> A := 1
while A < B do C endwhile --> while A < B do C end
''').strip().split("\n")]]]
for current in rewrite_rules:
if len(current[0]) == 1 and current[0] != current[1]:
is_added = False
if is_added == False:
synonyms += [[current[0],current[1]]]
#print(synonyms)
def token_with_whitespace(a):
return a[0] == "\"" and a[1].isalpha() or a[0].isalpha()
def join_tokens(token_list):
result = ""
for i in range(0,len(token_list)):
if i == 0:
result += token_list[i]
elif token_with_whitespace(token_list[i]) and token_with_whitespace(token_list[i-1]):
result += " __ " + token_list[i]
else:
result += " _ " + token_list[i]
return result
def get_synonyms_list(synonym):
global synonyms
to_return = [[synonym]]
changed = True
while changed:
changed = False
for current in synonyms:
if current[1] in to_return and not current[0] in to_return:
to_return += [current[0]]
changed = True
elif current[0] in to_return and not current[1] in to_return:
to_return += [current[1]]
changed = True
to_return = [["\""+b+"\"" for b in a] for a in to_return]
if len(to_return) == 1:
return to_return[0][0]
#print(to_return)
return "("+" | ".join([join_tokens(a) for a in to_return])+")"
#print(get_synonyms_list("though"))
def recursive_rewrite(input_string):
global rewrite_rules
outputs = [input_string]
changed = True
while changed:
changed = False
for current in outputs:
new_outputs = []
for [a,b] in rewrite_rules:
output = replace_matching_subsequence(current,a,b)
if output != False and output not in outputs and output not in new_outputs:
new_outputs += [output]
changed = True
if(len(output) == 1 and type_of(output[0]) != None):
return [output]
outputs = new_outputs
return []
index_of = {}
def unparse(text):
if type(text) == str:
if text.isupper():
return index_of[text]
else:
return "\""+text+"\"";
if type(text) == list:
return "("+" + \" \" + ".join([unparse(a) for a in text])+")"
else:
raise Exception("Cannot parse "+str(text))
def type_of(a):
if is_expr_(a):
return "expr"
elif is_number(a):
return "number"
else:
for current in type_patterns.keys():
if not current in ["number","expr"] and is_type_pattern(current)(a):
return current
def print_grammar_rule(split_text):
rewritten = recursive_rewrite(split_text)
type_of(rewritten)
if rewritten == []:
return ""
input_pattern = []
for i in range(0,len(split_text)):
if is_expr_(split_text[i]):
index_of[split_text[i]] = "d["+str((i+1)*2-2)+"]"
input_pattern += [types[split_text[i]]]
else:
input_pattern += [get_synonyms_list(split_text[i])]
return type_of(rewritten[0][0])+" -> "+join_tokens(input_pattern)+"{% function(d) {return "+unparse(rewritten[0][0])+";}%}\n"
nearley_output_string = '''
start -> _ (statements | expr) _ {% function(d) {return d[1][0]; } %}
imperative -> "(" _ statements _ ")" {% function(d) {return d[2];}%}
statements -> imperative | statements ( _ | _ (";" | "and" | "and" __ "then") _ ) imperative {% function(d) {return d[0] + d[2]; } %}
expr --> (bool|number|list|set|string) {% function(d) {return d[0]; } %}
set -> _name {% function(d) {return d[0]; } %} | "(" _ set _ ")" {% function(d) {return "("+d[2]+")";}%}
list -> _name {% function(d) {return d[0]; } %} | "(" _ list _ ")" {% function(d) {return "("+d[2]+")";}%}
number -> _name {% function(d) {return d[0]; } %} | "(" _ number _ ")" {% function(d) {return "("+d[2]+")";}%}
bool -> _name {% function(d) {return d[0]; } %} | "(" _ bool _ ")" {% function(d) {return "("+d[2]+")";}%}
string -> _name {% function(d) {return d[0]; } %} | "(" _ string _ ")" {% function(d) {return "("+d[2]+")";}%}
expr -> _name {% function(d) {return d[0]; } %}
_name -> [a-zA-Z_] {% id %}
| _name [\w_] {% function(d) {return d[0] + d[1]; } %}
number -> _number {% function(d) {return d[0]} %}
_float ->
_int {% id %}
| _int "." _posint {% function(d) {return d[0] + d[1] + d[2]; }%}
_posint ->
[0-9] {% id %}
| _posint [0-9] {% function(d) {return d[0] + d[1]} %}
_int ->
"-" _posint {% function(d) {return d[0] + d[1]; }%}
| _posint {% id %}
_number ->
_float {% id %}
| _float "e" _int {% function(d){return d[0] + d[1] + d[2]; } %}
_ -> null | _ [\s] {% function() {} %}
__ -> [\s] | __ [\s] {% function() {} %}
'''
marpa_output_string = '''use strict;
use warnings;
use Marpa::R2;
use Data::Dump;'''
for current in rewrite_rules:
types = {}
nearley_output_string += print_grammar_rule(current[0])
print(nearley_output_string)
with open("parser.ne", "wb") as file:
file.write(nearley_output_string.encode('utf8'))
thing = "hello"
print(thing and thing[0].isalpha())
#subprocess.run("python -m lark.tools.nearley parser.ne main nearley > parser.py".split(" "),shell=True)
#subprocess.run("nearleyc parser.ne -o parser.js".split(" "),shell=True)
import parser
#print(parser.parse("A is not less than B"))
print("type something to test the parser, or type exit to quit")
text_input = ""
while text_input != "exit":
text_input = input()
print(recursive_rewrite(text_input.split(" ")))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment