-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlean_parser.py
More file actions
216 lines (182 loc) · 8.09 KB
/
lean_parser.py
File metadata and controls
216 lines (182 loc) · 8.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""Lightweight Lean 4 parser using Lark for extracting declarations and match cases."""
from lark import Lark, Transformer, v_args, Token
from dataclasses import dataclass, field
from typing import Optional
# Grammar: enough to parse top-level declarations and match case structure.
# NOT a full Lean parser — just enough for code archaeology.
LEAN_GRAMMAR = r"""
start: item*
?item: decl | namespace_block | section_block | open_cmd | set_option | import_cmd | OTHER_LINE
decl: modifiers decl_kind NAME sig decl_body
modifiers: (MODIFIER)*
decl_kind: "theorem" | "lemma" | "def" | "inductive" | "structure" | "instance" | "class" | "axiom" | "abbrev"
sig: sig_token*
decl_body: ":=" body_content
| "where" body_content
| /* empty */
body_content: (match_expr | other_body_token)*
match_expr: "match" match_tokens "with" match_cases
match_cases: match_case+
match_case: "|" case_pattern "=>" case_body
case_pattern: pattern_token+
case_body: (nested_match | case_token)*
nested_match: "match" match_tokens "with" match_cases
namespace_block: "namespace" NAME NEWLINE item* "end" NAME
section_block: "section" NAME? NEWLINE item*
open_cmd: "open" /[^\n]*/
set_option: "set_option" /[^\n]*/
import_cmd: "import" /[^\n]*/
// Tokens — permissive, we don't need to type-check
sig_token: /(?!:=|where)[^\n]/ | NEWLINE /\s+(?!theorem|lemma|def|inductive|structure|instance|class|end\s|namespace\s)/
match_tokens: /(?!with\b)[^\n]/ | NEWLINE /\s+(?!with\b)/
pattern_token: /(?!=>)[^\n|]/
case_token: /(?!\|(?:\s*[.«]|\s+_\s*=>))[^\n]/ | NEWLINE /\s+(?!\|)/
other_body_token: /(?!match\b|\|(?:\s*[.«]))[^\n]/ | NEWLINE /\s+(?!theorem|lemma|def|inductive|structure|instance|class|end\s|namespace\s)/
MODIFIER: "private" | "protected" | "noncomputable" | "partial" | "mutual" | "unsafe"
NAME: /[a-zA-Z_«][a-zA-Z0-9_.'»?]*/
OTHER_LINE: /[^\n]*/ NEWLINE
NEWLINE: /\n/
%ignore /[ \t]+/
%ignore /--[^\n]*/
"""
# The full lark grammar above is too ambitious for Lean's syntax.
# Instead, use a simpler two-pass approach:
# Pass 1: Split file into top-level declarations (by indentation)
# Pass 2: For each `def`, split match body into cases
def parse_lean_file(content: str) -> dict:
"""Parse a Lean file into declarations and their match cases.
Returns: {
'declarations': [{'kind', 'name', 'line', 'end_line', 'body', 'sorry', 'cases': {...}}],
}
"""
lines = content.splitlines()
decls = []
# Pass 1: Find declarations by matching unindented lines
i = 0
while i < len(lines):
line = lines[i]
# Match declaration keywords at start of line (possibly with modifiers)
stripped = line.lstrip()
indent = len(line) - len(stripped)
kind = None
name = None
for kw in ('theorem', 'lemma', 'def', 'inductive', 'structure', 'instance', 'class', 'axiom', 'abbrev'):
# Check with optional modifiers
for prefix in ('', 'private ', 'protected ', 'noncomputable ', 'partial ', 'private partial ', 'noncomputable partial '):
if stripped.startswith(prefix + kw + ' '):
kind = kw
rest = stripped[len(prefix + kw):].strip()
# Extract name (handle names with ? and «»)
name_chars = []
for ch in rest:
if ch in ' (:{\n':
break
name_chars.append(ch)
name = ''.join(name_chars)
break
if kind:
break
if not kind:
i += 1
continue
# Find end of declaration: next unindented declaration or end/namespace
start_line = i
i += 1
while i < len(lines):
if not lines[i].strip():
i += 1
continue
next_indent = len(lines[i]) - len(lines[i].lstrip())
if next_indent <= indent:
next_stripped = lines[i].lstrip()
is_decl = any(next_stripped.startswith(p + k + ' ')
for k in ('theorem', 'lemma', 'def', 'inductive', 'structure', 'instance', 'class', 'axiom', 'abbrev', 'end ', 'namespace ', 'section ')
for p in ('', 'private ', 'protected ', 'noncomputable ', 'partial ', 'private partial '))
if is_decl or next_stripped.startswith('#') or next_stripped.startswith('set_option'):
break
i += 1
body = '\n'.join(lines[start_line:i])
has_sorry = 'sorry' in body and kind in ('theorem', 'lemma', 'def')
# Pass 2: Extract match cases for defs
cases = _extract_cases(lines[start_line:i])
decls.append({
'kind': kind,
'name': name,
'line': start_line + 1,
'end_line': i,
'body': body,
'sorry': has_sorry,
'cases': cases,
'num_lines': i - start_line,
})
return {'declarations': decls}
def _extract_cases(lines: list[str]) -> dict[str, str]:
"""Extract match cases from a list of lines belonging to a declaration.
Returns dict mapping case label to case body."""
cases = {}
current_label = None
current_lines = []
for line in lines:
stripped = line.lstrip()
# Detect case pattern: starts with `| .name` or `| «name»`
if stripped.startswith('| .') or stripped.startswith('| \u00ab'):
# Save previous case
if current_label is not None and current_lines:
cases[current_label] = '\n'.join(current_lines)
# Parse case name
after_pipe = stripped[2:].lstrip()
if after_pipe.startswith('.'):
# .name or .name(args) or .«name»
rest = after_pipe[1:]
name_chars = []
for ch in rest:
if ch in ' (\n,|':
break
name_chars.append(ch)
current_label = ''.join(name_chars)
elif after_pipe.startswith('\u00ab'):
# «name»
current_label = after_pipe[1:].split('\u00bb')[0]
else:
current_label = after_pipe.split()[0] if after_pipe.split() else '?'
current_lines = [line]
elif current_label is not None:
current_lines.append(line)
# Save last case
if current_label is not None and current_lines:
cases[current_label] = '\n'.join(current_lines)
return cases
def parse_step_function(content: str, func_name: str = 'step?') -> tuple[dict[str, str], str]:
"""Extract cases from a specific step function.
Returns (cases_dict, full_body_string)."""
result = parse_lean_file(content)
for decl in result['declarations']:
if decl['name'] == func_name:
return decl['cases'], decl['body']
return {}, ''
def diff_cases(old: dict[str, str], new: dict[str, str]) -> tuple[dict, dict, dict]:
"""Diff two case dicts. Returns (added, removed, changed)."""
added = {k: v for k, v in new.items() if k not in old}
removed = {k: v for k, v in old.items() if k not in new}
changed = {k: (old[k], new[k]) for k in old if k in new and old[k] != new[k]}
return added, removed, changed
# ── Self-test ──
if __name__ == '__main__':
import sys
path = sys.argv[1] if len(sys.argv) > 1 else 'VerifiedJS/Core/Semantics.lean'
content = open(path).read()
result = parse_lean_file(content)
decls = result['declarations']
print(f'{len(decls)} declarations in {path}')
theorems = [d for d in decls if d['kind'] in ('theorem', 'lemma')]
defs = [d for d in decls if d['kind'] == 'def']
print(f' {len(theorems)} theorems/lemmas, {len(defs)} defs')
sorry_count = sum(1 for d in decls if d['sorry'])
print(f' {sorry_count} with sorry')
# Show step? cases
cases, body = parse_step_function(content, 'step?')
print(f'\nstep?: {len(cases)} cases, {len(body.splitlines())} lines')
for k in sorted(cases)[:8]:
print(f' | .{k}: {cases[k].count(chr(10))+1}L')
if len(cases) > 8:
print(f' ... and {len(cases)-8} more')