Skip to content

Instantly share code, notes, and snippets.

@TimelessP
Created February 13, 2025 20:11
Show Gist options
  • Save TimelessP/1a307f9b23fe0fa83dde581f8fa80c12 to your computer and use it in GitHub Desktop.
Save TimelessP/1a307f9b23fe0fa83dde581f8fa80c12 to your computer and use it in GitHub Desktop.
larking about
from lark import Lark
# Lark grammar (EBNF-style) describing the coarse structure of a COBOL program.
#
# Structure, top-down:
#   start     -> one or more divisions
#   division  -> a division header followed by a body
#   body      -> any mix of sections and loose paragraphs
#   section   -> a "<NAME> SECTION." header plus at least one paragraph
#   paragraph -> an optional "<label>." heading plus one or more sentences
#   sentence  -> any run of non-period characters terminated by "."
#
# The IDENTIFICATION / ENVIRONMENT / DATA headers are single terminals that
# include their trailing period; the PROCEDURE DIVISION header is a rule so it
# can carry optional USING and RETURNING clauses before its period.
# Whitespace is skipped everywhere via `%ignore WS`.
#
# NOTE(review): the USING / RETURNING patterns carry no `(?i)` flag, unlike
# the other keyword terminals, so they look case-sensitive -- confirm whether
# that is intended.
# NOTE(review): `sentence_body` is /[^.]+/, so a period inside a quoted
# literal (e.g. 'INPUT.DAT') ends the sentence early and the literal is split
# across two sentences -- confirm this is acceptable.
COBOL_GRAMMAR = r"""
// Lark grammar for COBOL DIVISIONS with USING/RETURNING in PROCEDURE DIVISION
?start: divisions
divisions: division+
// A division consists of a header followed by a body.
division: division_header division_body
// A division header is one of the standard headers.
division_header: identification_division
| environment_division
| data_division
| procedure_division
identification_division: IDENTIFICATION_DIVISION
environment_division: ENVIRONMENT_DIVISION
data_division: DATA_DIVISION
// The PROCEDURE DIVISION header may include optional USING and RETURNING clauses.
procedure_division: procedure_division_header
procedure_division_header: PROCEDURE_DIVISION_KEYWORD using_clause? returning_clause? "."
PROCEDURE_DIVISION_KEYWORD: /(?i)PROCEDURE\s+DIVISION/
using_clause: /\s+USING\s+/ parameter_list
returning_clause: /\s+RETURNING\s+/ identifier
// A parameter list is a comma-separated list of identifiers.
parameter_list: identifier (COMMA identifier)*
COMMA: ","
// The division body may contain both sections and loose paragraphs.
division_body: (section | paragraph)*
// A section starts with a section header and must contain at least one paragraph.
section: section_header paragraph+
section_header: /(?i)[A-Z][A-Z0-9\-]*\s+SECTION\./
// A paragraph may have an optional heading (a label ending with a period)
// followed by one or more sentences.
paragraph: paragraph_heading? sentence+
paragraph_heading: identifier "."
// A sentence is defined as a block of text terminated by a period.
sentence: sentence_body "."
sentence_body: /[^.]+/
// Identifiers: assume an alphanumeric token starting with a letter (and may include dashes).
identifier: IDENTIFIER
IDENTIFIER: /[A-Z][A-Z0-9-]*/i
// Standard COBOL division headers.
IDENTIFICATION_DIVISION: /(?i)IDENTIFICATION\s+DIVISION\./
ENVIRONMENT_DIVISION: /(?i)ENVIRONMENT\s+DIVISION\./
DATA_DIVISION: /(?i)DATA\s+DIVISION\./
%import common.WS
%ignore WS
"""
def main():
    """Parse a built-in COBOL sample with COBOL_GRAMMAR and print the outcome.

    Builds an Earley parser from the module-level grammar, parses a sample
    program containing all four division types, and prints either the
    pretty-printed parse tree or the parser's error message.

    Only Lark's own errors are reported as "Parsing failed:"; any other
    exception propagates so that unrelated bugs are not mislabelled as
    parse failures.
    """
    # Imported locally because only this function needs the exception type.
    from lark.exceptions import LarkError

    # Extended COBOL code sample that includes all division types
    sample_code = r"""
IDENTIFICATION DIVISION.
PROGRAM-ID. HELLO-WORLD.
AUTHOR. JOHNDOE.
ENVIRONMENT DIVISION.
CONFIGURATION SECTION.
SOURCE-COMPUTER. IBM-370.
OBJECT-COMPUTER. IBM-370.
INPUT-OUTPUT SECTION.
FILE-CONTROL.
SELECT INFILE ASSIGN TO 'INPUT.DAT'.
SELECT OUTFILE ASSIGN TO 'OUTPUT.DAT'.
DATA DIVISION.
WORKING-STORAGE SECTION.
WS-VAR.
PIC 9(4) VALUE 0.
LINKAGE SECTION.
LS-VAR.
PIC X(10).
PROCEDURE DIVISION USING ARG1, ARG2 RETURNING RESULT.
MAIN-PARAGRAPH.
DISPLAY "Hello, World".
PERFORM PROCESS-DATA.
STOP RUN.
PROCESS-DATA.
DISPLAY "Processing Data".
"""
    # Create a Lark parser instance using the COBOL grammar.
    parser = Lark(COBOL_GRAMMAR, start='start', parser='earley')

    # Keep the try body to the one call that can raise a parse error, so
    # failures while printing are not reported as parse failures.
    try:
        tree = parser.parse(sample_code)
    except LarkError as e:
        print("Parsing failed:")
        print(e)
    else:
        print("Parse tree:")
        print(tree.pretty())
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment