Created
August 26, 2019 02:23
-
-
Save jarble/968f9aaa136cb07db3defb2dddc20198 to your computer and use it in GitHub Desktop.
A simple adaptive parser in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import copy | |
import sys | |
import subprocess | |
from tokenize import tokenize | |
from io import BytesIO | |
def tokenize_input(the_input): | |
return ([a.string for a in list(tokenize(BytesIO(the_input.encode('utf-8')).readline)) if a.string not in ["\n","","utf-8"]]) | |
def matches_pattern(string,pattern): | |
if len(string) != len(pattern): | |
return False | |
for i in range(0,len(string)): | |
if is_expr_(pattern[i]): | |
for j in range(0,len(string)): | |
if pattern[i] == pattern[j] and string[i] != string[j]: | |
return False | |
elif string[i] != pattern[i]: | |
return False | |
return True | |
def rewrite_rule(input_string,input_pattern,output_pattern): | |
output_string = copy.deepcopy(output_pattern) | |
if not matches_pattern(input_string,input_pattern): | |
return False | |
for i in range(0, len(input_pattern)): | |
if input_pattern[i][0].islower() and input_pattern[i] != input_string[i]: | |
return False | |
else: | |
for j in range(0,len(output_pattern)): | |
if output_pattern[j] == input_pattern[i]: | |
output_string[j] = input_string[i] | |
return output_string | |
def replace_matching_subsequence(input_string,input_pattern,output_pattern): | |
output_string = copy.deepcopy(input_string) | |
for i in range(0,len(input_string)-len(input_pattern)+1): | |
output_substring = rewrite_rule(input_string[i:len(input_pattern)+i],input_pattern,output_pattern) | |
if output_substring != False: | |
if type_of(output_substring) != None: | |
output_string[i : i+len(input_pattern)] = [output_substring] | |
else: | |
output_string[i : i+len(input_pattern)] = output_substring | |
return output_string | |
return False | |
def matches_patterns(a,patterns): | |
for pattern in patterns: | |
if matches_pattern(a,pattern): | |
return True | |
return False | |
types = {} | |
def simplify_pattern(pattern): | |
if type(pattern) == list: | |
return pattern | |
output = [] | |
for current in pattern.split(" "): | |
if current == ":number": | |
output += [is_number] | |
elif current == ":string": | |
output += [is_string] | |
elif current[0] == ":" and (current[1:len(current)] in ["bool","set","list","imperative"]): | |
output += [is_type_pattern(current[1:len(current)])] | |
elif current == ":expr": | |
output += [is_expr] | |
else: | |
output += [current] | |
return output | |
def simplify_patterns(patterns): | |
return [simplify_pattern(x) for x in [b.strip() for b in patterns.strip().split("\n")]] | |
def is_expr_(a): | |
if type(a) == str and (a[0].isupper()): | |
if not a in types: | |
types[a] = "expr" | |
return True | |
return False | |
def is_expr(a): | |
return type_of(a) != None | |
def matches_type_pattern(a,pattern): | |
if(len(a) != len(pattern)): | |
return False | |
elif type(a) == str and a == pattern: | |
return True | |
for i in range(0,len(a)): | |
if(type(pattern[i]) == str and pattern[i] != a[i] or callable(pattern[i]) and not pattern[i](a[i])): | |
return False | |
return True | |
global type_patterns | |
type_patterns = {} | |
def matches_type_patterns(a,patterns): | |
for pattern in patterns: | |
if matches_type_pattern(a,pattern): | |
return True | |
return False | |
def is_number(a): | |
global type_patterns | |
if (type(a) == str and (a.isdigit() or a[0].isupper())) or matches_type_patterns(a,type_patterns["number"]): | |
if(type(a) == str): | |
types[a] = "number" | |
return True | |
else: | |
return False | |
def is_string(a): | |
global type_patterns | |
if (type(a) == str and (a[0] == "\"")) or matches_type_patterns(a,type_patterns["string"]): | |
if(type(a) == str): | |
types[a] = "string" | |
return True | |
else: | |
return False | |
def is_type_pattern(the_type): | |
def to_return(a): | |
global type_patterns | |
if ( | |
is_expr_(a) | |
or matches_type_patterns(a,type_patterns[the_type]) | |
): | |
if(type(a) == str): | |
types[a] = the_type | |
return True | |
else: | |
return False | |
return to_return | |
type_patterns["list"] = simplify_patterns(''' | |
:expr in :list | |
[ :expr for :expr in :list ] | |
[ :expr for :expr in :list if :bool ] | |
type of :expr | |
dot product of :list and :list | |
cross product of :list and :list | |
determinant of :list | |
largest number in :list | |
smallest number in :list | |
list ( itertools . permutations ( :list ) ) | |
:list [ :: -1 ] | |
sorted ( :list ) | |
:list + :list | |
''') | |
type_patterns["number"] = simplify_patterns(''' | |
- :number | |
Math . PI | |
:number % :number | |
:number + :number | |
:number - :number | |
:number * :number | |
:number / :number | |
:number ** :number | |
maximize :number subject to :bool | |
absolute value of :number | |
sqrt ( :number ) | |
cbrt :number | |
diff ( :number , :number ) | |
integral of :number with respect to :number | |
integral of :number | |
integral of :number from :number to :number | |
integral of :number with respect to :number from :number to :number | |
greatest common factor of :number and :number | |
all solutions of :number | |
sin ( :number ) | |
cos ( :number ) | |
tan ( :number ) | |
ln :number | |
limit of :number as :number -> :number | |
domain of the function :number | |
range of the function :number | |
a random number between :number and :number | |
distance from :list to :list | |
''') | |
type_patterns["bool"] = simplify_patterns(''' | |
:string means :string | |
:number == :number | |
forall :expr ( :bool ) | |
:bool == :bool | |
:list == :list | |
:set == :set | |
:number != :number | |
:bool != :bool | |
:list != :list | |
:set != :set | |
:number > :number | |
:number < :number | |
:number >= :number | |
:number <= :number | |
:number is a number | |
:number is a string | |
:number is a list | |
:bool or :bool | |
:bool and :bool | |
issubsetof ( :set , :set ) | |
there exists :expr such that :bool | |
true | |
false | |
''') | |
type_patterns["set"] = simplify_patterns(''' | |
intersection of :set and :set | |
union of :set and :set''') | |
type_patterns["string"] = simplify_patterns(''' | |
substring of :string from :number to :number | |
longest match of the regex :string in the string :string | |
:string is a substring of :string | |
:string matches the regex :string | |
:replace :string in the string :string with :string''') | |
type_patterns["imperative"] = simplify_patterns(''' | |
console . log ( :expr ) ; | |
while ( :bool ) { :imperative } | |
var :expr = :expr ; | |
:expr = :expr ; | |
input ( :string ) ; | |
return :expr ; | |
if :bool then :imperative | |
if :bool then :imperative else :imperative | |
''') | |
def pattern_to_string(pattern): | |
str1 = "" | |
num = 0 | |
for i in pattern: | |
if(callable(i)): | |
num += 1 | |
str1 += " A" + str(num) | |
else: | |
str1 += " " + i | |
return str1 + " --> " + str1 | |
rewrite_rule_string = "" | |
for current in type_patterns.values(): | |
for i in current: | |
rewrite_rule_string += "\n" + pattern_to_string(i) | |
global rewrite_rules | |
global synonyms | |
synonyms = [] | |
rewrite_rules = [[list(filter(None,i.split(" "))),list(filter(None,j.split(" ")))] for [i,j] in [a.split("-->") for a in [b.strip() for b in | |
(rewrite_rule_string+''' | |
first A letters of B --> substring of B from 0 to A | |
A is a substring of B --> A is a substring of B | |
A is a superset of B --> A is a subset of B | |
A contains B --> B is in A | |
the string A contains the string B --> B is a substring of A | |
A is more than B --> A > B | |
A and B --> A and B | |
A squared --> A to the power of 2 | |
A cubed --> A to the power of 2 | |
although --> though | |
greater --> more | |
but --> and | |
A though B --> A but B | |
A is greater than or equal to B --> A >= B | |
A is not equal to B --> A != B | |
A is less than or equal to B --> A <= B | |
A divided by B --> A / B | |
A is less than B --> B > A | |
A is C than B and D than E --> ( A is C than B ) and ( A is D than E ) | |
( A is C than B ) and D than E --> ( A is C than B ) and ( A is D than E ) | |
A minus B --> A - B | |
sum of A and B --> A plus B | |
A is not more than B --> A <= B | |
A is not less than B --> A >= B | |
A is not one of B --> A is not in B | |
A is not in B --> (A is in B) == false | |
( A ) --> A | |
∞ --> infinity | |
sum of A and B --> A + B | |
A must be B --> A == B | |
A must not be B --> A != B | |
A cannot be B --> A != B | |
the A --> A | |
A is not B --> ( A != B ) | |
A is neither B nor C --> ( A is not B ) and ( A is not C ) | |
A is not a B --> ( A is a B ) == false | |
A multiplied by B --> A * B | |
A to the power of B --> A ** B | |
[ A for A in B where D ] --> [ A for A in B if D ] | |
[ A in B where D ] --> [ A for A in B where D ] | |
every number in L that is greater than B --> [ a in B where a > B ] | |
all permutations of X --> list ( itertools . permutations ( X ) ) | |
every permutation of X --> all permutations of X | |
product of A and B --> A * B | |
A mod B --> A % B | |
A is divisible by B --> ( A % B ) == 0 | |
A is not X by Y --> ( A is X by Y ) == false | |
A is not a number --> A is not a number | |
if A then B otherwise C --> if A then B else C | |
not A --> ( A == false ) | |
unless A then B otherwise C --> if ( not A ) then B else C | |
A is a match of the regex B --> A matches the regex B | |
the string A matches the regex B --> A is a match of the regex B | |
A are in B --> A is a subset of B | |
plus --> + | |
A is not a B of C --> ( A is a B of C ) == false | |
none of A are in B --> A is not a subset of B | |
derivative of A with respect to B --> diff ( A , B ) | |
A percent of B --> A * 0.10 * B | |
A does not equal B --> A != B | |
A ≠ B --> A != B | |
A if B --> if B then A | |
B implies A --> A if B | |
A unless B --> if ( not A ) then B | |
distance between A and B --> distance from A to B | |
sin A --> sin ( A ) | |
cos A --> cos ( A ) | |
tan A --> tan ( A ) | |
sine of A --> sin A | |
cosine of A --> cos A | |
tangent of A --> tan A | |
cotangent of A --> 1 / ( tan A ) | |
secant of A --> 1 / ( cos A ) | |
secant of A --> 1 / ( cos A ) | |
A = B --> A == B | |
=/= --> != | |
A ÷ B --> A / B | |
A % of B --> A percent of B | |
quotient of A and B --> A / B | |
A iff B --> ( A implies B ) and ( B implies A ) | |
⇔ --> iff | |
[ A | B <- C ] --> [ A for B in C ] | |
| A | --> absolute value of A | |
square root of A --> sqrt ( A ) | |
cube root of A --> cbrt A | |
^ --> ** | |
one --> 1 | |
two --> 2 | |
three --> 3 | |
four --> 4 | |
five --> 5 | |
six --> 6 | |
seven --> 7 | |
eight --> 8 | |
nine --> 9 | |
ten --> 10 | |
minimize A subject to B --> maximize A subject to - ( B ) | |
maximize A so that B --> maximize A subject to B | |
maximize A such that B --> maximize A so that B | |
maximize A where B --> maximize A so that B | |
A ∪ B --> union of A and B | |
A ∩ B --> intersection of A and B | |
A || B --> A or B | |
A && B --> A and B | |
larger --> greater | |
largest --> greatest | |
maximum --> greatest | |
minimum --> smallest | |
natural logarithm of A --> ln A | |
regex --> regular expression | |
¬ A --> not A | |
A ≥ B --> A >= B | |
A ≤ B --> A <= B | |
A ⊃ B --> A is a superset of B | |
A ⊂ B --> A is a subset of B | |
A is a subset of B --> issubsetof ( A , B ) | |
iff --> if and only if | |
A is implied by B --> B implies A | |
∀ A ( B ) --> forall A ( B ) | |
↔ --> iff | |
A if B --> if B then A | |
A ∨ B --> A or B | |
∃ A ( B ) --> there exists A such that B | |
if ( A ) { B } --> if A then B | |
A is any of B --> A is in B | |
A is in B --> A in B | |
A equals B --> A == B | |
A times B --> A * B | |
A is a factor of B --> B is divisible by A | |
longest substring of A that matches the regex B --> longest match of the regex B in the string A | |
A is B --> A == B | |
A reversed --> A [ :: -1 ] | |
A spelled backwards --> A reversed | |
A sorted --> sorted ( A ) | |
A in alphabetical order --> sorted ( A ) | |
until A { B } --> while ( A == false ) { B } | |
A while B --> while ( B ) { A } | |
! A --> A == false | |
A := B --> A = B ; | |
set A to B --> A := B | |
initialize A to B --> var A = B ; | |
A += B --> A := A + B | |
A -= B --> A := A - B | |
A *= B --> A := A * B | |
A /= B --> A := A / B | |
add A to B --> B += A | |
divide B by A --> B /= A | |
subtract B from A --> A -= B | |
multiply A by B --> A *= B | |
let A = B --> initialize A to B | |
let A be B --> let A = B | |
A until B --> until B { A } | |
print A --> console . log ( A ) ; | |
π --> Math . PI | |
pi --> π | |
if A { B } --> if A then B | |
while A { B } --> while ( A ) { B } | |
''').strip().split("\n")]]] | |
for current in rewrite_rules: | |
if len(current[0]) == 1 and current[0] != current[1]: | |
synonyms += [[current[0],current[1]]] | |
#print(synonyms) | |
def token_with_whitespace(a): | |
return a[0] == "\"" and a[1].isalpha() or a[0].isalpha() | |
def join_tokens(token_list): | |
result = "" | |
for i in range(0,len(token_list)): | |
if i == 0: | |
result += token_list[i] | |
elif token_with_whitespace(token_list[i]) and token_with_whitespace(token_list[i-1]): | |
result += " __ " + token_list[i] | |
else: | |
result += " _ " + token_list[i] | |
return result | |
def get_synonyms_list(synonym): | |
global synonyms | |
to_return = [[synonym]] | |
changed = True | |
while changed: | |
changed = False | |
for current in synonyms: | |
if current[1] in to_return and not current[0] in to_return: | |
to_return += [current[0]] | |
changed = True | |
elif current[0] in to_return and not current[1] in to_return: | |
to_return += [current[1]] | |
changed = True | |
to_return = [["\""+b+"\"" for b in a] for a in to_return] | |
if len(to_return) == 1: | |
return to_return[0][0] | |
#print(to_return) | |
return "("+" | ".join([join_tokens(a) for a in to_return])+")" | |
#print(get_synonyms_list("though")) | |
def do_action(output): | |
global rewrite_rules | |
if len(output) == 3 and output[1] == "means": | |
rewrite_rules += [[list(filter(None,output[0][1:len(output[0])-1].split(" "))),list(filter(None,output[2][1:len(output[2])-1].split(" ")))]] | |
print([list(filter(None,output[0][1:len(output[0])-1].split(" "))),list(filter(None,output[2][1:len(output[2])-1].split(" ")))]) | |
print(rewrite_rules[0]) | |
else: | |
print(str(output)) | |
def recursive_rewrite(input_string): | |
global rewrite_rules | |
outputs = [input_string] | |
changed = True | |
while changed: | |
changed = False | |
for current in outputs: | |
new_outputs = [] | |
for [a,b] in rewrite_rules: | |
output = replace_matching_subsequence(current,a,b) | |
if output != False and output not in outputs and output not in new_outputs: | |
new_outputs += [output] | |
changed = True | |
if(len(output) == 1 and type_of(output[0]) != None): | |
do_action(output[0]) | |
return [output] | |
outputs = new_outputs | |
return [] | |
index_of = {} | |
def unparse(text): | |
if type(text) == str: | |
if text.isupper(): | |
return index_of[text] | |
else: | |
return "\""+text+"\""; | |
if type(text) == list: | |
return "("+" + \" \" + ".join([unparse(a) for a in text])+")" | |
else: | |
raise Exception("Cannot parse "+str(text)) | |
def type_of(a): | |
if is_expr_(a): | |
return "expr" | |
elif is_number(a): | |
return "number" | |
else: | |
for current in type_patterns.keys(): | |
if not current in ["number","expr"] and is_type_pattern(current)(a): | |
return current | |
thing = "hello" | |
print(thing and thing[0].isalpha()) | |
#subprocess.run("python -m lark.tools.nearley parser.ne main nearley > parser.py".split(" "),shell=True) | |
#subprocess.run("nearleyc parser.ne -o parser.js".split(" "),shell=True) | |
import parser | |
#print(parser.parse("A is not less than B")) | |
print("type something to test the parser, or type exit to quit") | |
text_input = "" | |
for current in [tokenize_input(b.strip()) for b in (''' | |
"while A do B end" means "while A { B }" | |
"if A then B end" means "if A { B }" | |
"A <- 1" means "A := 1" | |
"while A < B do C endwhile" means "while A < B do C end" | |
"A is not B" means "A != B" | |
"A is not B" means "A == ( not B )" | |
"A / B of C" means "C * ( A / B )" | |
"half" means "1 / 2" | |
"understand A as B" means "A means B" | |
understand "twice A" as "A * 2" | |
understand "the meaning of A is B" as "A means B" | |
the meaning of "thrice A" is "A * 3" | |
"twelve" means "12" | |
"five" means "five" | |
"nine" means "9" | |
"A % of B" means "B * ( A / 100 )" | |
"[every A in B such that D]" means "[ A for A in B where D ]" | |
'''.strip().split("\n"))]: | |
recursive_rewrite(current) | |
while text_input != "exit": | |
text_input = input() | |
print(recursive_rewrite(tokenize_input(text_input))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment