Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Fuzzing Without Specifications: Learning Struct...

Fuzzing Without Specifications: Learning Structure from Behaviour Part I

Avatar for Rahul Gopinath

Rahul Gopinath

June 04, 2025
Tweet

More Decks by Rahul Gopinath

Other Decks in Research

Transcript

  1. 9 $ ./fuzz [;x1-GPZ+wcckc];,N9J+?#6^6\e?]9lu
 2_%'4GX"0VUB[E/r ~fApu6b8<{%siq8Z
 h.6{V,hr?;{Ti.r3PIxMMMv6{xS^+'Hq!
 AxB"YXRS@!Kd6;wtAMefFWM(`|J_<1~o}
 z3K(CCzRH JIIvHz>_*.\>JrlU32~eGP?


    lR=bF3+;y$3lodQ<B89!5"W2fK*vE7v{'
 )KC-i,c{<[~m!]o;{.'}Gj\(X}EtYetrp
 bY@aGZ1{P!AZU7x#4(Rtn!q4nCwqol^y6
 }0|Ko=*JK~;zMKV=9Nai:wxu{J&UV#HaU
 )*BiC<),`+t*gka<W=Z.%T5WGHZpI30D<
 Pq>&]BS6R&j?#tP7iaV}-}`\?[_[Z^LBM
 PG-FKj'\xwuZ1=Q`^`5,$N$Q@[!CuRzJ2
 D|vBy!^zkhdf3C5PAkR?V((-%><hn|3='
 i2Qx]D$qs4O`1@fevnG'2\11Vf3piU37@
 5:dfd45*(7^%5ap\zIyl"'f,$ee,J4Gw:
 cgNKLie3nx9(`efSlg6#[K"@WjhZ}r[Sc
 un&sBCS,T[/3]KAeEnQ7lU)3Pn,0)G/6N
 -wyzj/MTd#A;r*(ds./df3r8Odaf?/<#r Program ✘ Random Fuzzing
  2. 10

  3. 13

  4. @app.route('/admin') def admin(): username = request.cookies.get("username") if not username: return

    {"Error": "Specify username in Cookie"} username = urllib.quote(os.path.basename(username)) url = "http://permissions:5000/permissions/{}".format(username) resp = requests.request(method="GET", url=url) # "superadmin\ud888" will be simpli fi ed to "superadmin" ret = ujson.loads(resp.text) if resp.status_code == 200: if "superadmin" in ret["roles"]: return {"OK": "Superadmin Access granted"} else: e = u"Access denied. User has following roles: {}".format(ret["roles"]) return {"Error": e}, 401 else:return {"Error": ret["Error"]}, 500 [ ; x 1 - G P Z + w c c k c ] ; , N 9 J + ? # 6 ^ 6 \ e ? ] 9 l u 2 _ % ' 4 G X " 0 V U B [ E / r ~ f A p u 6 b 8 < { % s i q 8 Z h . 6 { V , h r ? ; {Ti.r3PIxMMMv6{xS^+'Hq!AxB"YXRS@! Kd6;wtAMefFWM(`|J_<1~o}z3K(CCzRH J I I v H z > _ * . \ > J r l U 3 2 ~ e G P ? lR=bF3+;y$3lodQ<B89!5"W2fK*vE7v{')KC- i,c{<[~m!]o;{.'}Gj\(X}EtYetrpbY@aGZ1{P! A Z U 7 x # 4 ( R t n ! q 4 n C w q o l ^ y 6 } 0 | Ko=*JK~;zMKV=9Nai:wxu{J&UV#HaU)*Bi C < ) , ` + t * g k a < W = Z . % T 5 W G H Z p I 3 0 D < P q > & ] B S 6 R & j ? # t P 7 i a V } - } ` \ ? [ _ [ Z ^ L B M P G - FKj'\xwuZ1=Q`^`5,$N$Q@[!CuRzJ2D|vBy! ^ z k h d f 3 C 5 P A k R ? V ( ( - % > < h n | 3='i2Qx]D$qs4O`1@fevnG'2\11Vf3piU37@ 5 : d f d 4 5 * ( 7 ^ % 5 a p \ z I y l " ' f , $ee,J4Gw:cgNKLie3nx9(`efSlg6#[K"@Wjh Z}r[Scun&sBCS,T[/3]KAeEnQ7lU)3Pn,0)G/ 6N-wyzj/MTd#A;r Program 14 https://www.fuzzingbook.org/html/Fuzzer.html Traditional Fuzzing
  5. • Insert Instrumentation • Generate inputs • Collect execution feedback

    • Branches covered during execution • Slightly Mutate Input and try again Collect inputs obtaining new coverage 16 Feedback Driven Fuzzing 16 https://www.fuzzingbook.org/html/MutationFuzzer.html
  6. 17 def triangle(a, b, c): if a == b: if

    b == c: return Equilateral else: return Isosceles else: if b == c: return Isosceles else: if a == c: return Isosceles else: return Scalene def triangle(a, b, c): __probe_enter() if a == b: __probe_1() if b == c: __probe_2() return Equilateral else: __probe_3() return Isosceles else: __probe_4() if b == c: __probe_5() return Isosceles else: __probe_6() if a == c: __probe_7() return Isosceles else: __probe_8() return Scalene def triangle(a, b, c): if a == b: if b == c: return Equilateral else: return Isosceles else: if b == c: return Isosceles else: if a == c: return Isosceles else: return Scalene • Insert Instrumentation • Generate inputs • Collect execution feedback • Branches covered during execution • Slightly Mutate Input and try again Collect inputs obtaining new coverage 17 https://www.fuzzingbook.org/html/MutationFuzzer.html Feedback Driven Fuzzing
  7. 18 Feedback Driven Fuzzing triangle (1,1,1) def triangle(a, b, c):

    if a == b: if b == c: return Equilateral else: return Isosceles else: if b == c: return Isosceles else: if a == c: return Isosceles else: return Scalene • Insert Instrumentation • Generate inputs • Collect execution feedback • Branches covered during execution • Slightly Mutate Input and try again Collect inputs obtaining new coverage 18 https://www.fuzzingbook.org/html/MutationFuzzer.html
  8. 19 triangle (1,1,1) def triangle(a, b, c): if a ==

    b: if b == c: return Equilateral else: return Isosceles else: if b == c: return Isosceles else: if a == c: return Isosceles else: return Scalene • Insert Instrumentation • Generate inputs • Collect execution feedback • Branches covered during execution • Slightly Mutate Input and try again Collect inputs obtaining new coverage 19 https://www.fuzzingbook.org/html/MutationFuzzer.html Feedback Driven Fuzzing
  9. triangle (1,1,1) 20 triangle (1,1,2) def triangle(a, b, c): if

    a == b: if b == c: return Equilateral else: return Isosceles else: if b == c: return Isosceles else: if a == c: return Isosceles else: return Scalene • Insert Instrumentation • Generate inputs • Collect execution feedback • Branches covered during execution • Slightly Mutate Input and try again Collect inputs obtaining new coverage Mutated 20 https://www.fuzzingbook.org/html/MutationFuzzer.html Feedback Driven Fuzzing
  10. 21 triangle (1,1,3) def triangle(a, b, c): if a ==

    b: if b == c: return Equilateral else: return Isosceles else: if b == c: return Isosceles else: if a == c: return Isosceles else: return Scalene • Insert Instrumentation • Generate inputs • Collect execution feedback • Branches covered during execution • Slightly Mutate Input and try again Collect inputs obtaining new coverage Mutated 21 https://www.fuzzingbook.org/html/MutationFuzzer.html Feedback Driven Fuzzing
  11. Feedback Driven Fuzzing 22 triangle (1,1,2) def triangle(a, b, c):

    if a == b: if b == c: return Equilateral else: return Isosceles else: if b == c: return Isosceles else: if a == c: return Isosceles else: return Scalene • Insert Instrumentation • Generate inputs • Collect execution feedback • Branches covered during execution • Slightly Mutate Input and try again Collect inputs obtaining new coverage 22 https://www.fuzzingbook.org/html/MutationFuzzer.html triangle (1,1,1) triangle (1,1,3)
  12. Feedback Driven Fuzzing • Insert Instrumentation • Generate inputs •

    Collect execution feedback • Branches covered during execution • Slightly Mutate Input and try again Collect inputs obtaining new coverage 23 triangle (1,1,2) def triangle(a, b, c): if a == b: if b == c: return Equilateral else: return Isosceles else: if b == c: return Isosceles else: if a == c: return Isosceles else: return Scalene triangle (1,1,1) AFL 23
  13. 24 Feedback Driven Fuzzing Weakness: static int is_reserved_word_token(const char *s,

    int len) { const char *reserved[] = { "break", "case", "catch", "continue", "debugger", "default", "delete", "do", "else", "false", "finally", "for", "function", "if", "in", "instanceof", "new", "null", "return", "switch", "this", "throw", "true", "try", "typeof", "var", "void", "while", "with", "let", "undefined", ((void *)0)}; int i; if (!mjs_is_alpha(s[0])) return 0; for (i = 0; reserved[i] != ((void *)0); i++) { if (len == (int)strlen(reserved[i]) && strncmp(s, reserved[i], len) == 0) return i + 1; } return 0; } Tokens if (x > 100) { } coverage: 20% if (x > 100) { } e coverage: 5% if (x > 100) { } el coverage: 5% if (x > 100) { } els coverage: 5% if (x > 100) { } else coverage: 25% No smooth coverage gradient in parsers 24 Note: Constant unrolling (e.g. AFL) does not help in such lexical tokens
  14. 25 def json_raw(stm): while True: stm.skipspaces() c = stm.peek() if

    c == 't': return json_fixed(stm, 'true') elif c == 'f': return json_fixed(stm, 'false') elif c == 'n': return json_fixed(stm, 'null') elif c == '"': return json_string(stm) elif c == '{': return json_dict(stm) elif c == '[': return json_list(stm) elif c in NUMSTART: return json_number(stm) raise JSONError(E_MALF, stm, stm.pos) Weak points: • Need for smooth coverage gradient • Coverage only provides first level guidance 1. {"abc":[]} 2. [{"a":[]}, {"b":[]}, {"c":["ab","c"]}] 25 Feedback Driven Fuzzing
  15. 26 def json_raw(stm): while True: stm.skipspaces() c = stm.peek() if

    c == 't': return json_fixed(stm, 'true') elif c == 'f': return json_fixed(stm, 'false') elif c == 'n': return json_fixed(stm, 'null') elif c == '"': return json_string(stm) elif c == '{': return json_dict(stm) elif c == '[': return json_list(stm) elif c in NUMSTART: return json_number(stm) raise JSONError(E_MALF, stm, stm.pos) Weak points: • Need for smooth coverage gradient • Coverage only provides first level guidance 1. {"abc":[]} 2. [{"a":[]}, {"b":[]}, {"c":["ab","c"]}] 26 Feedback Driven Fuzzing
  16. 27 def json_raw(stm): while True: stm.skipspaces() c = stm.peek() if

    c == 't': return json_fixed(stm, 'true') elif c == 'f': return json_fixed(stm, 'false') elif c == 'n': return json_fixed(stm, 'null') elif c == '"': return json_string(stm) elif c == '{': return json_dict(stm) elif c == '[': return json_list(stm) elif c in NUMSTART: return json_number(stm) raise JSONError(E_MALF, stm, stm.pos) Weak points: • Need for smooth coverage gradient • Coverage only provides first level guidance 1. {"abc":[]} 2. [{"a":[]}, {"b":[]}, {"c":["ab","c"]}] 27 Feedback Driven Fuzzing
  17. 28

  18. 31 Fuzzing Parsers $ ./fuzz [;x1-GPZ+wcckc];,N9J+?#6^6\e?]9lu
 2_%'4GX"0VUB[E/r ~fApu6b8<{%siq8Z
 h.6{V,hr?;{Ti.r3PIxMMMv6{xS^+'Hq!
 AxB"YXRS@!Kd6;wtAMefFWM(`|J_<1~o}


    z3K(CCzRH JIIvHz>_*.\>JrlU32~eGP?
 lR=bF3+;y$3lodQ<B89!5"W2fK*vE7v{'
 )KC-i,c{<[~m!]o;{.'}Gj\(X}EtYetrp
 bY@aGZ1{P!AZU7x#4(Rtn!q4nCwqol^y6
 }0|Ko=*JK~;zMKV=9Nai:wxu{J&UV#HaU
 )*BiC<),`+t*gka<W=Z.%T5WGHZpI30D<
 Pq>&]BS6R&j?#tP7iaV}-}`\?[_[Z^LBM
 PG-FKj'\xwuZ1=Q`^`5,$N$Q@[!CuRzJ2
 D|vBy!^zkhdf3C5PAkR?V((-%><hn|3='
 i2Qx]D$qs4O`1@fevnG'2\11Vf3piU37@
 5:dfd45*(7^%5ap\zIyl"'f,$ee,J4Gw:
 cgNKLie3nx9(`efSlg6#[K"@WjhZ}r[Sc
 un&sBCS,T[/3]KAeEnQ7lU)3Pn,0)G/6N
 -wyzj/MTd#A;r*(ds./df3r8Odaf?/<#r Parser Syntax Error Interpreter #
  19. 32 Leveraging Instrumentation Based Feedback def parse_num(s,i): n = ''

    while s[i:] and is_digit(s[i]): n += s[i] i = i +1 return i,n def parse_paren(s, i): assert s[i] == '(' i, v = parse_expr(s, i+1) if s[i:] == '': raise Exception(s, i) assert s[i] == ')' return i+1, v def parse_expr(s, i = 0): expr = [] is_op = True while s[i:]: c = s[i] if c in string.digits: if not is_op: raise Exception(s,i) i,num = parse_num(s,i) expr.append(num) is_op = False elif c in ['+', '-', '*', '/']: if is_op: raise Exception(s,i) expr.append(c) is_op = True i = i + 1 elif c == '(': if not is_op: raise Exception(s,i) i, cexpr = parse_paren(s, i) expr.append(cexpr) is_op = False elif c == ')': break else: raise Exception(s,i) if is_op: raise Exception(s,i) return i, expr Parser Syntax Error Interpreter #
  20. 38 Speci fi cation Free Generators A ( 2 -

    B 9 ) 4 ) A ∉ (,+,-,1,2,3,4,5,6,7,8,9,0 B ∉ +,-,1,2,3,4,5,6,7,8,9,0,) ) ∉ +,-,1,2,3,4,5,6,7,8,9,0 (2-94)
  21. 39

  22. 43 Formal Languages Formal Language Descriptions 3. Regular Context Free

    Recursively Enumerable (Chomsky,1956) Easy to produce and parse Argument Stack Return Stack
  23. 44 Grammar <start> := <expr> <expr> := <expr> '+' <expr>

    | <expr> '-' <expr> | <expr> '/' <expr> | <expr> '*' <expr> | '(' <expr> ')' | <number> <number> := <integer> | <integer> '.' <integer> <integer>:= <digit> <integer> | <digit> <digit> := [0-9] Arithmetic expression grammar De f inition for <expr> <expr> key
  24. 45 <start> := <expr> <expr> := <expr> '+' <expr> |

    <expr> '-' <expr> | <expr> '/' <expr> | <expr> '*' <expr> | '(' <expr> ')' | <number> <number> := <integer> | <integer> '.' <integer> <integer>:= <digit> <integer> | <digit> <digit> := [0-9] Grammar Arithmetic expression grammar Expansion Rule Terminal Symbol Nonterminal Symbol
  25. 47 Grammars For Parsing (8 / 3) * 49 <start>

    := <expr> <expr> := <expr> '+' <expr> | <expr> '-' <expr> | <expr> '/' <expr> | <expr> '*' <expr> | '(' <expr> ')' | <number> <number> := <integer> | <integer> '.' <integer> <integer>:= <digit> <integer> | <digit> <digit> := [0-9]
  26. 49 Parsing is Surprisingly Integral to Fuzzing 1. Parsing is

    necessary for decomposing and recomposing inputs without affecting the validity of inputs constructed (8 / 3) * 49
  27. 50 Parsing is Surprisingly Integral to Fuzzing 1. Parsing is

    necessary for decomposing and recomposing inputs without affecting the validity of inputs constructed (8 / 3) * 49
  28. 51 Parsing is Surprisingly Integral to Fuzzing 1. Parsing is

    necessary for decomposing and recomposing inputs without affecting the validity of inputs constructed (2 + 1) * 49 1 2 + (8 / 3) * 49
  29. 52 Parsing is Surprisingly Integral to Fuzzing 2. Use parsers

    to mine the input distribution, and generate uncommon inputs
  30. 53 Parsing is Surprisingly Integral to Fuzzing 2. Use parsers

    to extract the complexity of an input or a set of inputs 18439249 (8/3)*49 https://rahul.gopinath.org/post/2024/03/23/k-paths-for-context-free-grammars/
  31. 54

  32. 55 Grammars 8.2 - 27 - -9 / +((+9 *

    --2 + --+-+- ((-1 * +(8 - 5 - 6)) * (-(a-+(((+(4) )))) - ++4) / +(-+---((5.6 - --(3 * -1.8 * +(6 * +-(((-(-6) * ---+6)) / +--(+-+-7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6.37) + (1) / 482) / +++-+0)))) + 8.2 - 27 - -9 / +((+9 * --2 + --+-+-((-1 * + (8 - 5 - 6)) * (-(a-+(((+(4))))) - + +4) / +(-+---((5.6 - --(3 * -1.8 * + (6 * +-(((-(-6) * ---+6)) / +--(+-+- 7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6.37) + (1) / 482) / +++-+0)))) * -+5 + 7.513)))) - (+1 / ++((-84)))))))) * ++5 / +-(- -2 - -++-9.0)))) / 5 * --++090 + * - +5 + 7.513)))) - (+1 / ++((-84)))))) )) * 8.2 - 27 - -9 / +((+9 * --2 + - -+-+-((-1 * +(8 - 5 - 6)) * (-(a-+(( (+(4))))) - ++4) / +(-+---((5.6 - -- (3 * -1.8 * +(6 * +-(((-(-6) * ---+6 )) / +--(+-+-7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6 .37) + (1) / 482) / +++-+0)))) * -+5 + 7.513)))) - (+1 / ++((-84)))))))) * ++5 / +-(--2 - -++-9.0)))) / 5 * --++090 ++5 / +-(--2 - -++-9.0)))) / 5 * --++090 <start> := <expr> <expr> := <expr> '+' <expr> | <expr> '-' <expr> | <expr> '/' <expr> | <expr> '*' <expr> | '(' <expr> ')' | <number> <number> := <integer> | <integer> '.' <integer> <integer>:= <digit> <integer> | <digit> <digit> := [0-9] For Fuzzing (Hanford 1970) (Purdom 1972)
  33. 56 Grammars As effective producers Interpreter Parser ✘ ✔ 8.2

    - 27 - -9 / +((+9 * --2 + --+-+- ((-1 * +(8 - 5 - 6)) * (-(a-+(((+(4) )))) - ++4) / +(-+---((5.6 - --(3 * -1.8 * +(6 * +-(((-(-6) * ---+6)) / +--(+-+-7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6.37) + (1) / 482) / +++-+0)))) + 8.2 - 27 - -9 / +((+9 * --2 + --+-+-((-1 * + (8 - 5 - 6)) * (-(a-+(((+(4))))) - + +4) / +(-+---((5.6 - --(3 * -1.8 * + (6 * +-(((-(-6) * ---+6)) / +--(+-+- 7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6.37) + (1) / 482) / +++-+0)))) * -+5 + 7.513)))) - (+1 / ++((-84)))))))) * ++5 / +-(- -2 - -++-9.0)))) / 5 * --++090 + * - +5 + 7.513)))) - (+1 / ++((-84)))))) )) * 8.2 - 27 - -9 / +((+9 * --2 + - -+-+-((-1 * +(8 - 5 - 6)) * (-(a-+(( (+(4))))) - ++4) / +(-+---((5.6 - -- (3 * -1.8 * +(6 * +-(((-(-6) * ---+6 )) / +--(+-+-7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6 .37) + (1) / 482) / +++-+0)))) * -+5 + 7.513)))) - (+1 / ++((-84)))))))) * ++5 / +-(--2 - -++-9.0)))) / 5 * --++090 ++5 / +-(--2 - -++-9.0)))) / 5 * --++090
  34. 57 Grammars <start> := <expr> <expr> := <expr> '+' <expr>

    | <expr> '-' <expr> | <expr> '/' <expr> | <expr> '*' <expr> | '(' <expr> ')' | <number> <number> := <integer> | <integer> '.' <integer> <integer>:= <digit> <integer> | <digit> <digit> := [0-9] As efficient producers def start(): expr() def expr(): match (random() % 6): case 0: expr(); print('+'); expr() case 1: expr(); print('-'); expr() case 2: expr(); print('/'); expr() case 3: expr(); print('*'); expr() case 4: print('('); expr(); print(')') case 5: number() def number(): match (random() % 2): case 0: integer() case 1: integer(); print('.'); integer() def integer(): match (random() % 2): case 0: digit(); integer() case 1: digit() def digit(): match (random() % 10): case 0: print('0') case 1: print('1') case 2: print('2') case 3: print('3') case 4: print('4') case 5: print('5') case 6: print('6') case 7: print('7') Compiled Grammar (F1)
  35. 63 Where to Get the Grammar From? 1. Extract the

    input string accesses 2. Attach control fl ow information Hand-written parsers already encode the grammar Each control- fl ow structure gets wrapped in a context-manager -If conditionals -Loops -Subroutines
  36. 64 How to Extract This Grammar? • Inputs + control

    fl ow -> Dynamic Control Dependence Trees • DCD Trees -> Parse Tree
  37. 65 Control Dependence Graph Statement B is control dependent on

    A if A determines whether B executes. def parse_csv(s,i): while s[i:]: if is_digit(s[i]): n,j = num(s[i:]) i = i+j else: comma(s[i]) i += 1 CDG for parse_csv while: determines whether if: executes
  38. 66 def parse_csv(s,i): while s[i:]: if is_digit(s[i]): n,j = num(s[i:])

    i = i+j else: comma(s[i]) i += 1 CDG for parse_csv Dynamic Control Dependence Tree Each statement execution is represented as a separate node DCD Tree for call parse_csv()
  39. 67 def parse_csv(s,i): while s[i:]: if is_digit(s[i]): n,j = num(s[i:])

    i = i+j else: comma(s[i]) i += 1 '1' '2' ',' DCD Tree ~ Parse Tree •No tracking beyond input bu ff er •Characters are attached to nodes where they are accessed last "12," "12,"
  40. 69

  41. 70 def is_digit(i): return i in '0123456789' def parse_num(s,i): n

    = '' while s[i:] and is_digit(s[i]): n += s[i] i = i +1 return i,n def parse_paren(s, i): assert s[i] == '(' i, v = parse_expr(s, i+1) if s[i:] == '': raise Ex(s, i) assert s[i] == ')' return i+1, v def parse_expr(s, i = 0): expr, is_op = [], True while s[i:]: c = s[i] if isdigit(c): if not is_op: raise Ex(s,i) i,num = parse_num(s,i) expr.append(num) is_op = False elif c in ['+', '-', '*', '/']: if is_op: raise Ex(s,i) expr.append(c) is_op, i = True, i + 1 elif c == '(': if not is_op: raise Ex(s,i) i, cexpr = parse_paren(s, i) expr.append(cexpr) is_op = False elif c == ')': break else: raise Ex(s,i) if is_op: raise Ex(s,i) return i, expr 9+3/4 Parse tree for parse_expr('9+3/4')
  42. 71

  43. 72 def is_digit(i): return i in '0123456789' def parse_num(s,i): n

    = '' while s[i:] and is_digit(s[i]): n += s[i] i = i +1 return i,n def parse_paren(s, i): assert s[i] == '(' i, v = parse_expr(s, i+1) if s[i:] == '': raise Ex(s, i) assert s[i] == ')' return i+1, v def parse_expr(s, i = 0): expr, is_op = [], True while s[i:]: c = s[i] if isdigit(c): if not is_op: raise Ex(s,i) i,num = parse_num(s,i) expr.append(num) is_op = False elif c in ['+', '-', '*', '/']: if is_op: raise Ex(s,i) expr.append(c) is_op, i = True, i + 1 elif c == '(': if not is_op: raise Ex(s,i) i, cexpr = parse_paren(s, i) expr.append(cexpr) is_op = False elif c == ')': break else: raise Ex(s,i) if is_op: raise Ex(s,i) return i, expr 9+3/4 Identifying Compatible Nodes Which nodes correspond to the same nonterminal
  44. <parse_expr> := <while 1:1> <while 1:0> <while 1:1> | <while

    1:1> <while 1:0> <while 1:1> <while 1:0> <while 1:1> <while 1:0> <while 1:1> | <while 1:1> <while 1:0> <while 1:1> <while 1:0> <while 1:1> | <while 1:1> <while 1:1> := <if 1:1> <if 1:1> := <parse_num> | <parse_paren> <parse_num> := <is_digit> <is_digit> := '3' | '1' <parse_paren>:= '(' <parse_expr> ')' <while 1:0> := <if 1:0> <if 1:0> := '*' 78
  45. <parse_expr> := <while_s> <while_s> := <while_1:1> <while_1:0> <while_s> | <while_1:1>

    <parse_expr> := <while 1:1> <while 1:0> <while 1:1> | <while 1:1> <while 1:0> <while 1:1> <while 1:0> <while 1:1> <while 1:0> <while 1:1> | <while 1:1> <while 1:0> <while 1:1> <while 1:0> <while 1:1> | <while 1:1> <while 1:1> := <if 1:1> <if 1:1> := <parse_num> | <parse_paren> <parse_num> := <is_digit> <is_digit> := '3' | '1' <parse_paren>:= '(' <parse_expr> ')' <while 1:0> := <if 1:0> <if 1:0> := '*' 79 Generalizing Loops with Regular Inference
  46. 80

  47. 81 def is_digit(i): return i in '0123456789' def parse_num(s,i): n

    = '' while s[i:] and is_digit(s[i]): n += s[i] i = i +1 return i,n def parse_paren(s, i): assert s[i] == '(' i, v = parse_expr(s, i+1) if s[i:] == '': raise Ex(s, i) assert s[i] == ')' return i+1, v def parse_expr(s, i = 0): expr, is_op = [], True while s[i:]: c = s[i] if isdigit(c): if not is_op: raise Ex(s,i) i,num = parse_num(s,i) expr.append(num) is_op = False elif c in ['+', '-', '*', '/']: if is_op: raise Ex(s,i) expr.append(c) is_op, i = True, i + 1 elif c == '(': if not is_op: raise Ex(s,i) i, cexpr = parse_paren(s, i) expr.append(cexpr) is_op = False elif c == ')': break else: raise Ex(s,i) if is_op: raise Ex(s,i) return i, expr <START> := <parse_expr.0-0-c> <parse_expr.0-0-c> := <parse_expr.0-1-s><parse_expr.0> | <parse_expr.0> <parse_expr.0-1-s> := <parse_expr.0><parse_expr.0-2> | <parse_expr.0><parse_expr.0-2><parse_expr.0-1-s> <parse_expr.0> := '(' <parse_expr.0-0-c> ')' | <parse_num.0-1-s> <parse_expr.0-2> := '*' | '+' | '-' | '/' <parse_num.0-1-s> := <is_digit.0-0-c> | <is_digit.0-0-c><parse_num.0-1-s> <is_digit.0-0-c> : [0-9] calc.py Recovered Arithmetic Grammar
  48. 82 8.2 - 27 - -9 / +((+9 * --2

    + --+-+- ((-1 * +(8 - 5 - 6)) * (-(a-+(((+(4) )))) - ++4) / +(-+---((5.6 - --(3 * -1.8 * +(6 * +-(((-(-6) * ---+6)) / +--(+-+-7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6.37) + (1) / 482) / +++-+0)))) + 8.2 - 27 - -9 / +((+9 * --2 + --+-+-((-1 * + (8 - 5 - 6)) * (-(a-+(((+(4))))) - + +4) / +(-+---((5.6 - --(3 * -1.8 * + (6 * +-(((-(-6) * ---+6)) / +--(+-+- 7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6.37) + (1) / 482) / +++-+0)))) * -+5 + 7.513)))) - (+1 / ++((-84)))))))) * ++5 / +-(- -2 - -++-9.0)))) / 5 * --++090 + * - +5 + 7.513)))) - (+1 / ++((-84)))))) )) * 8.2 - 27 - -9 / +((+9 * --2 + - -+-+-((-1 * +(8 - 5 - 6)) * (-(a-+(( (+(4))))) - ++4) / +(-+---((5.6 - -- (3 * -1.8 * +(6 * +-(((-(-6) * ---+6 )) / +--(+-+-7 * (-0 * (+(((((2)) + 8 - 3 - ++9.0 + ---(--+7 / (1 / +++6 .37) + (1) / 482) / +++-+0)))) * -+5 + 7.513)))) - (+1 / ++((-84)))))))) * ++5 / +-(--2 - -++-9.0)))) / 5 * --++090 ++5 / +-(--2 - -++-9.0)))) / 5 * --++090 <START> := <parse_expr.0-0-c> <parse_expr.0-0-c> := <parse_expr.0-1-s><parse_expr.0> | <parse_expr.0> <parse_expr.0-1-s> := <parse_expr.0><parse_expr.0-2> | <parse_expr.0><parse_expr.0-2><parse_expr.0-1-s> <parse_expr.0> := '(' <parse_expr.0-0-c> ')' | <parse_num.0-1-s> <parse_expr.0-2> := '*' | '+' | '-' | '/' <parse_num.0-1-s> := <is_digit.0-0-c> | <is_digit.0-0-c><parse_num.0-1-s> <is_digit.0-0-c> : [0-9]
  49. 83 <START> ::= <json_raw> <json_raw> ::= '"' <json_string'> | '['

    <json_list'> | '{' <json_dict'> | <json_number'> | 'true' | 'false' | 'null' <json_number'> ::= <json_number>+ | <json_number>+ 'e' <json_number>+ <json_number> ::= '+' | '-' | '.' | [0-9] | 'E' | 'e' <json_string'> ::= <json_string>* '"' <json_list'> ::= ']' | <json_raw> (','<json_raw>)* ']' | ( ',' <json_raw>)+ (',' <json_raw>)* ']' <json_dict'> ::= '}' | ( '"' <json_string'> ':' <json_raw> ',' )* '"'<json_string'> ':' <json_raw> '}' <json_string> ::= ' ' | '!' | '#' | '$' | '%' | '&' | ''' | '*' | '+' | '-' | ',' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '[' | ']' | '^' | '_', ''',| '{' | '|' | '}' | '~' | '[A-Za-z0-9]' | '\' <decode_escape> <decode_escape> ::= '"' | '/' | 'b' | 'f' | 'n' | 'r' | 't' stm.next() if expect_key: raise JSONError(E_DKEY, stm, stm.pos) if c == '}': return result expect_key = 1 continue # parse out a key/value pair elif c == '"': key = _from_json_string(stm) stm.skipspaces() c = stm.next() if c != ':': raise JSONError(E_COLON, stm, stm.pos) stm.skipspaces() val = _from_json_raw(stm) result[key] = val expect_key = 0 continue raise JSONError(E_MALF, stm, stm.pos) def _from_json_raw(stm): while True: stm.skipspaces() c = stm.peek() if c == '"': return _from_json_string(stm) elif c == '{': return _from_json_dict(stm) elif c == '[': return _from_json_list(stm) elif c == 't': return _from_json_fixed(stm, 'true', True, E_BOOL) elif c == 'f': return _from_json_fixed(stm, 'false', False, E_BOOL) elif c == 'n': return _from_json_fixed(stm, 'null', None, E_NULL) elif c in NUMSTART: return _from_json_number(stm) raise JSONError(E_MALF, stm, stm.pos) def from_json(data): stm = JSONStream(data) return _from_json_raw(stm) microjson.py Recovered JSON grammar
  50. 84