Last active
May 8, 2022 03:33
-
-
Save VideoCarp/71eeb6cabadc6f450ede32cd6ebf21c2 to your computer and use it in GitHub Desktop.
A lexer for the English language written in F#.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
open System.Text | |
// Helper functions (SEE End of file for details) | |
// True when `c` is one of the ASCII digits '0'..'9'.
let numeric c = not (c < '0' || c > '9')
// True when `c` can be part of a word: an ASCII letter (either case) or an
// apostrophe, so contractions such as "don't" stay in one piece.
let ofword c =
    match c with
    | '\'' -> true
    | _ -> ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
// True when `c` is the double-quote character that delimits a quotation.
let str (c: char) = '"' = c // " (balances the quote for syntax highlighters)
// True for every character except the double quote, i.e. for anything that
// may appear inside a quotation.
let notstr (c: char) = not (c = '"') // " (balances the quote for syntax highlighters)
// Lexer | |
// Tokenises `input` into a list of (lexeme, tag) string pairs, in source order.
//   * each punctuation character listed in `punctuationTag` becomes one token
//   * a run of `ofword` characters becomes a "word" token
//   * a run of `numeric` characters becomes a "number" token
//   * the text between two double quotes becomes a "quote" token (the quotes
//     themselves are dropped); an unterminated quote runs to the end of input
//   * any other character (whitespace included) is skipped
// The scan threads an accumulator so that every recursive call is a direct
// tail call: stack usage no longer grows with the number of tokens, which
// previously risked a stack overflow on large inputs.
let lexer (input: string) =
    let len = input.Length

    // Tag for a single-character punctuation token, or None for anything else.
    let punctuationTag c =
        match c with
        | '(' -> Some "oparen"
        | ')' -> Some "cparen"
        | '.' -> Some "stop"
        | ',' -> Some "comma"
        | ';' -> Some "semicolon"
        | ':' -> Some "colon"
        | '=' -> Some "equals"
        // NOTE(review): '-' is a hyphen-minus, not an en dash; the tag is kept
        // unchanged so existing consumers of the token stream are unaffected.
        | '-' -> Some "en dash"
        | '—' -> Some "em dash"
        | '!' -> Some "exclamation"
        | '?' -> Some "question"
        | '\'' -> Some "apostrophe"
        | '…' -> Some "ellipsis"
        | _ -> None

    // Index of the last character of the run starting at `i` whose characters
    // all satisfy `cond`; yields `i - 1` when the run is empty.
    let rec findend cond i =
        if i < len && cond input.[i] then findend cond (i + 1)
        else i - 1

    // `acc` holds the tokens found so far, most recent first.
    let rec lex start acc =
        if start >= len then List.rev acc
        else
            let grapheme = input.[start]
            // Punctuation is tested first, so a token that *starts* with an
            // apostrophe is an "apostrophe" token rather than a word.
            match punctuationTag grapheme with
            | Some tag -> lex (start + 1) ((string grapheme, tag) :: acc)
            | None ->
                // (run predicate, tag, delimiter width skipped on each side)
                let run =
                    if ofword grapheme then Some (ofword, "word", 0)
                    elif numeric grapheme then Some (numeric, "number", 0)
                    elif str grapheme then Some (notstr, "quote", 1)
                    else None
                match run with
                | Some (cond, tag, skip) ->
                    let tkstart = start + skip
                    let tkend = findend cond tkstart
                    // Resume after the run and after its closing delimiter, if any.
                    lex (tkend + skip + 1) ((input.[tkstart..tkend], tag) :: acc)
                | None -> lex (start + 1) acc

    lex 0 []
// Test | |
// Prompt for a file path on stdin, lex that file's contents, and print one
// token per line.
printf "Program: "
let program = IO.File.ReadAllText(Console.ReadLine())
let tokens = lexer program
for token in tokens do
    printfn "%A" token
(*
----***********************************************************************************************
 * This is a lexical analyser written by hand in F#, designed to analyse the English language.
 * It cannot handle all the features of the language, but it can handle most of them.
 * It can be utilised as a basis for basic natural language processing and as a
 * template for lexical analysers.
 * Benchmarks show it's pretty fast. It took me 48 ms on average to process 100,000 characters
 * that total up to 25,000 strings. I ran the test 100 times using Diagnostics.Stopwatch.
----***********************************************************************************************
*)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment