swade-fr-content-neoflip/tools/lpeg/lpcode.lua

1057 lines
35 KiB
Lua

--[[
LPEGLJ
lpcode.lua
Generating code from tree
Copyright (C) 2014 Rostislav Sacek.
based on LPeg v1.0 - PEG pattern matching for Lua
Lua.org & PUC-Rio written by Roberto Ierusalimschy
http://www.inf.puc-rio.br/~roberto/lpeg/
** Permission is hereby granted, free of charge, to any person obtaining
** a copy of this software and associated documentation files (the
** "Software"), to deal in the Software without restriction, including
** without limitation the rights to use, copy, modify, merge, publish,
** distribute, sublicense, and/or sell copies of the Software, and to
** permit persons to whom the Software is furnished to do so, subject to
** the following conditions:
**
** The above copyright notice and this permission notice shall be
** included in all copies or substantial portions of the Software.
**
** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
**
** [ MIT license: http://www.opensource.org/licenses/mit-license.php ]
--]]
local ffi = require "ffi"
require "lpvm"
local band, bor, bnot, rshift, lshift = bit.band, bit.bor, bit.bnot, bit.rshift, bit.lshift
local TChar = 0
local TSet = 1
local TAny = 2 -- standard PEG elements
local TTrue = 3
local TFalse = 4
local TRep = 5
local TSeq = 6
local TChoice = 7
local TNot = 8
local TAnd = 9
local TCall = 10
local TOpenCall = 11
local TRule = 12 -- sib1 is rule's pattern, sib2 is 'next' rule
local TGrammar = 13 -- sib1 is initial (and first) rule
local TBehind = 14 -- match behind
local TCapture = 15 -- regular capture
local TRunTime = 16 -- run-time capture
local IAny = 0 -- if no char, fail
local IChar = 1 -- if char != val, fail
local ISet = 2 -- if char not in val, fail
local ITestAny = 3 -- in no char, jump to 'offset'
local ITestChar = 4 -- if char != val, jump to 'offset'
local ITestSet = 5 -- if char not in val, jump to 'offset'
local ISpan = 6 -- read a span of chars in val
local IBehind = 7 -- walk back 'val' characters (fail if not possible)
local IRet = 8 -- return from a rule
local IEnd = 9 -- end of pattern
local IChoice = 10 -- stack a choice; next fail will jump to 'offset'
local IJmp = 11 -- jump to 'offset'
local ICall = 12 -- call rule at 'offset'
local IOpenCall = 13 -- call rule number 'offset' (must be closed to a ICall)
local ICommit = 14 -- pop choice and jump to 'offset'
local IPartialCommit = 15 -- update top choice to current position and jump
local IBackCommit = 16 -- "fails" but jump to its own 'offset'
local IFailTwice = 17 -- pop one choice and then fail
local IFail = 18 -- go back to saved state on choice and jump to saved offset
local IGiveup = 19 -- internal use
local IFullCapture = 20 -- complete capture of last 'off' chars
local IOpenCapture = 21 -- start a capture
local ICloseCapture = 22
local ICloseRunTime = 23
local Cclose = 0
local Cposition = 1
local Cconst = 2
local Cbackref = 3
local Carg = 4
local Csimple = 5
local Ctable = 6
local Cfunction = 7
local Cquery = 8
local Cstring = 9
local Cnum = 10
local Csubst = 11
local Cfold = 12
local Cruntime = 13
local Cgroup = 14
local PEnullable = 0
local PEnofail = 1
local RuleLR = 0x10000
local NOINST = -2
local MAXBEHINDPREDICATE = 255
local MAXRULES = 200
local MAXOFF = 0xF
-- number of siblings for each tree
local numsiblings = {
0, 0, 0, -- char, set, any
0, 0, -- true, false
1, -- rep
2, 2, -- seq, choice
1, 1, -- not, and
0, 0, 2, 1, -- call, opencall, rule, grammar
1, -- behind
1, 1 -- capture, runtime capture
}
local patternelement = ffi.typeof('PATTERN_ELEMENT')
local pattern = ffi.typeof('PATTERN')
local settype = ffi.typeof('int32_t[8]')
local fullset = settype(-1, -1, -1, -1, -1, -1, -1, -1)
-- {======================================================
-- Analysis and some optimizations
-- =======================================================
local codegen
-- Check whether a charset is empty (IFail), singleton (IChar),
-- full (IAny), or none of those (ISet).
local function charsettype(cs)
local count = 0;
local candidate = -1; -- candidate position for a char
for i = 0, 8 - 1 do
local b = cs[i];
if b == 0 then
if count > 1 then
return ISet; -- else set is still empty
end
elseif b == -1 then
if count < (i * 32) then
return ISet;
else
count = count + 32; -- set is still full
end
-- byte has only one bit?
elseif band(b, (b - 1)) == 0 then
if count > 0 then
return ISet; -- set is neither full nor empty
-- set has only one char till now; track it
else
count = count + 1;
candidate = i;
end
else
return ISet; -- byte is neither empty, full, nor singleton
end
end
if count == 0 then
return IFail, 0 -- empty set
-- singleton; find character bit inside byte
elseif count == 1 then
local b = cs[candidate];
local c = candidate * 32;
for i = 1, 32 do
if b == 1 then
c = c + i - 1
break
end
b = rshift(b, 1)
end
return IChar, c
elseif count == 256 then
return IAny, 0 -- full set
else
assert(false) -- should have returned by now
end
end
-- A few basic operations on Charsets
local function cs_complement(cs)
for i = 0, 8 - 1 do
cs[i] = bnot(cs[i])
end
end
local function cs_equal(cs1, cs2)
for i = 0, 8 - 1 do
if cs1[i] ~= cs2[i] then
return
end
end
return true
end
-- computes whether sets st1 and st2 are disjoint
local function cs_disjoint(st1, st2)
for i = 0, 8 - 1 do
if band(st1[i], st2[i]) ~= 0 then
return
end
end
return true
end
-- Convert a 'char' pattern (TSet, TChar, TAny) to a charset
local function tocharset(tree, index, valuetable)
local val = settype()
if tree.p[index].tag == TSet then
ffi.copy(val, valuetable[tree.p[index].val], ffi.sizeof(val))
return val
elseif tree.p[index].tag == TChar then
local b = tree.p[index].val
-- only one char
-- add that one
val[rshift(b, 5)] = lshift(1, band(b, 31))
return val
elseif tree.p[index].tag == TAny then
ffi.fill(val, ffi.sizeof(val), 0xff)
return val
end
end
-- checks whether a pattern has captures
local function hascaptures(tree, index)
if tree.p[index].tag == TCapture or tree.p[index].tag == TRunTime then
return true
elseif tree.p[index].tag == TCall then
return hascaptures(tree, index + tree.p[index].ps)
else
local ns = numsiblings[tree.p[index].tag + 1]
if ns == 0 then
return
elseif ns == 1 then
return hascaptures(tree, index + 1)
elseif ns == 2 then
if hascaptures(tree, index + 1) then
return true
else
return hascaptures(tree, index + tree.p[index].ps)
end
else
assert(false)
end
end
end
-- Checks how a pattern behaves regarding the empty string,
-- in one of two different ways:
-- A pattern is *nullable* if it can match without consuming any character;
-- A pattern is *nofail* if it never fails for any string
-- (including the empty string).
-- The difference is only for predicates; for patterns without
-- predicates, the two properties are equivalent.
-- (With predicates, &'a' is nullable but not nofail. Of course,
-- nofail => nullable.)
-- These functions are all convervative in the following way:
-- p is nullable => nullable(p)
-- nofail(p) => p cannot fail
-- (The function assumes that TOpenCall and TRunTime are not nullable:
-- TOpenCall must be checked again when the grammar is fixed;
-- TRunTime is an arbitrary choice.)
local function checkaux(tree, pred, index, lrcall)
lrcall = lrcall or {}
local tag = tree.p[index].tag
if tag == TChar or tag == TSet or tag == TAny or
tag == TFalse or tag == TOpenCall then
return -- not nullable
elseif tag == TRep or tag == TTrue then
return true -- no fail
elseif tag == TNot or tag == TBehind then
-- can match empty, but may fail
if pred == PEnofail then
return
else
return true -- PEnullable
end
elseif tag == TAnd then
-- can match empty; fail iff body does
if pred == PEnullable then
return true
else
return checkaux(tree, pred, index + 1, lrcall)
end
-- can fail; match empty iff body does
elseif tag == TRunTime then
if pred == PEnofail then
return
else
return checkaux(tree, pred, index + 1, lrcall)
end
elseif tag == TSeq then
if not checkaux(tree, pred, index + 1, lrcall) then
return
else
return checkaux(tree, pred, index + tree.p[index].ps, lrcall)
end
elseif tag == TChoice then
if checkaux(tree, pred, index + tree.p[index].ps, lrcall) then
return true
else
return checkaux(tree, pred, index + 1, lrcall)
end
elseif tag == TCapture or tag == TGrammar or tag == TRule then
return checkaux(tree, pred, index + 1, lrcall)
elseif tag == TCall then
--left recursive rule
if bit.band(tree.p[index].cap, 0xffff) ~= 0 then
local lr = index + tree.p[index].ps
if lrcall[lr] then
return
end
lrcall[lr] = true
end
return checkaux(tree, pred, index + tree.p[index].ps, lrcall)
else
assert(false)
end
end
-- number of characters to match a pattern (or -1 if variable)
-- ('count' avoids infinite loops for grammars)
local function fixedlenx(tree, count, len, index)
local tag = tree.p[index].tag
if tag == TChar or tag == TSet or tag == TAny then
return len + 1;
elseif tag == TFalse or tag == TTrue or tag == TNot or tag == TAnd or tag == TBehind then
return len;
elseif tag == TRep or tag == TRunTime or tag == TOpenCall then
return -1;
elseif tag == TCapture or tag == TRule or tag == TGrammar then
return fixedlenx(tree, count, len, index + 1)
elseif tag == TCall then
if count >= MAXRULES then
return -1; -- may be a loop
else
return fixedlenx(tree, count + 1, len, index + tree.p[index].ps)
end
elseif tag == TSeq then
len = fixedlenx(tree, count, len, index + 1)
if (len < 0) then
return -1;
else
return fixedlenx(tree, count, len, index + tree.p[index].ps)
end
elseif tag == TChoice then
local n1 = fixedlenx(tree, count, len, index + 1)
if n1 < 0 then return -1 end
local n2 = fixedlenx(tree, count, len, index + tree.p[index].ps)
if n1 == n2 then
return n1
else
return -1
end
else
assert(false)
end
end
-- Computes the 'first set' of a pattern.
-- The result is a conservative aproximation:
-- match p ax -> x' for some x ==> a in first(p).
-- match p '' -> '' ==> returns 1.
-- The set 'follow' is the first set of what follows the
-- pattern (full set if nothing follows it)
local function getfirst(tree, follow, index, valuetable, lrcall)
lrcall = lrcall or {}
local tag = tree.p[index].tag
if tag == TChar or tag == TSet or tag == TAny then
local firstset = tocharset(tree, index, valuetable)
return 0, firstset
elseif tag == TTrue then
local firstset = settype()
ffi.copy(firstset, follow, ffi.sizeof(firstset))
return 1, firstset
elseif tag == TFalse then
local firstset = settype()
return 0, firstset
elseif tag == TChoice then
local e1, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall)
local e2, csaux = getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall)
for i = 0, 8 - 1 do
firstset[i] = bor(firstset[i], csaux[i])
end
return bor(e1, e2), firstset
elseif tag == TSeq then
if not checkaux(tree, PEnullable, index + 1) then
return getfirst(tree, fullset, index + 1, valuetable, lrcall)
-- FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl))
else
local e2, csaux = getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall)
local e1, firstset = getfirst(tree, csaux, index + 1, valuetable, lrcall)
if e1 == 0 then -- 'e1' ensures that first can be used
return 0, firstset
-- one of the children has a matchtime?
elseif band(bor(e1, e2), 2) == 2 then
return 2, firstset -- pattern has a matchtime capture
else
return e2, firstset -- else depends on 'e2'
end
end
elseif tag == TRep then
local _, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall)
for i = 0, 8 - 1 do
firstset[i] = bor(firstset[i], follow[i])
end
return 1, firstset -- accept the empty string
elseif tag == TCapture or tag == TGrammar or tag == TRule then
return getfirst(tree, follow, index + 1, valuetable, lrcall)
-- function invalidates any follow info.
elseif tag == TRunTime then
local e, firstset = getfirst(tree, fullset, index + 1, valuetable, lrcall)
if e ~= 0 then
return 2, firstset -- function is not "protected"?
else
return 0, firstset -- pattern inside capture ensures first can be used
end
elseif tag == TCall then
-- left recursive rule
if bit.band(tree.p[index].cap, 0xffff) ~= 0 then
local lr = index + tree.p[index].ps
if lrcall[lr] then
return 0, settype()
else
lrcall[lr] = true
end
end
return getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall)
elseif tag == TAnd then
local e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall)
for i = 0, 8 - 1 do
firstset[i] = band(firstset[i], follow[i])
end
return e, firstset
elseif tag == TNot then
local firstset = tocharset(tree, index + 1, valuetable)
if firstset then
cs_complement(firstset)
return 1, firstset
end
local e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall)
ffi.copy(firstset, follow, ffi.sizeof(firstset))
return bor(e, 1), firstset -- always can accept the empty string
-- instruction gives no new information
elseif tag == TBehind then
-- call 'getfirst' to check for math-time captures
local e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall)
ffi.copy(firstset, follow, ffi.sizeof(firstset))
return bor(e, 1), firstset -- always can accept the empty string
else
assert(false)
end
end
-- If it returns true, then pattern can fail only depending on the next
-- character of the subject
local function headfail(tree, index, lrcall)
lrcall = lrcall or {}
local tag = tree.p[index].tag
if tag == TChar or tag == TSet or tag == TAny or tag == TFalse then
return true
elseif tag == TTrue or tag == TRep or tag == TRunTime or tag == TNot or tag == TBehind then
return
elseif tag == TCapture or tag == TGrammar or tag == TRule or tag == TAnd then
return headfail(tree, index + 1, lrcall)
elseif tag == TCall then
-- left recursive rule
if bit.band(tree.p[index].cap, 0xffff) ~= 0 then
local lr = index + tree.p[index].ps
if lrcall[lr] then
return true
else
lrcall[lr] = true
end
end
return headfail(tree, index + tree.p[index].ps, lrcall)
elseif tag == TSeq then
if not checkaux(tree, PEnofail, index + tree.p[index].ps) then
return
else
return headfail(tree, index + 1, lrcall)
end
elseif tag == TChoice then
if not headfail(tree, index + 1, lrcall) then
return
else
return headfail(tree, index + tree.p[index].ps, lrcall)
end
else
assert(false)
end
end
-- Check whether the code generation for the given tree can benefit
-- from a follow set (to avoid computing the follow set when it is
-- not needed)
local function needfollow(tree, index)
local tag = tree.p[index].tag
if tag == TChar or tag == TSet or tag == TAny or tag == TFalse or tag == TTrue or tag == TAnd or tag == TNot or
tag == TRunTime or tag == TGrammar or tag == TCall or tag == TBehind then
return
elseif tag == TChoice or tag == TRep then
return true
elseif tag == TCapture then
return needfollow(tree, index + 1)
elseif tag == TSeq then
return needfollow(tree, index + tree.p[index].ps)
else
assert(false)
end
end
-- ======================================================
-- {======================================================
-- Code generation
-- =======================================================
-- code generation is recursive; 'opt' indicates that the code is
-- being generated under a 'IChoice' operator jumping to its end.
-- 'tt' points to a previous test protecting this code. 'fl' is
-- the follow set of the pattern.
local function addinstruction(code, op, val)
local size = code.size
if size >= code.allocsize then
code:doublesize()
end
code.p[size].code = op
code.p[size].val = val
code.size = size + 1
return size
end
local function setoffset(code, instruction, offset)
code.p[instruction].offset = offset;
end
-- Add a capture instruction:
-- 'op' is the capture instruction; 'cap' the capture kind;
-- 'key' the key into ktable; 'aux' is optional offset
local function addinstcap(code, op, cap, key, aux)
local i = addinstruction(code, op, bor(cap, lshift(aux, 4)))
setoffset(code, i, key)
return i
end
local function jumptothere(code, instruction, target)
if instruction >= 0 then
setoffset(code, instruction, target - instruction)
end
end
local function jumptohere(code, instruction)
jumptothere(code, instruction, code.size)
end
-- Code an IChar instruction, or IAny if there is an equivalent
-- test dominating it
local function codechar(code, c, tt)
assert(tt ~= -1)
if tt >= 0 and code.p[tt].code == ITestChar and
code.p[tt].val == c then
addinstruction(code, IAny, 0)
else
addinstruction(code, IChar, c)
end
end
-- Code an ISet instruction
local function coderealcharset(code, cs, valuetable)
local ind = #valuetable + 1
valuetable[ind] = cs
return addinstruction(code, ISet, ind)
end
-- code a char set, optimizing unit sets for IChar, "complete"
-- sets for IAny, and empty sets for IFail; also use an IAny
-- when instruction is dominated by an equivalent test.
local function codecharset(code, cs, tt, valuetable)
local op, c = charsettype(cs)
if op == IChar then
codechar(code, c, tt)
elseif op == ISet then
assert(tt ~= -1)
if tt >= 0 and code.p[tt].code == ITestSet and
cs_equal(cs, valuetable[code.p[tt].val]) then
addinstruction(code, IAny, 0)
else
coderealcharset(code, cs, valuetable)
end
else
addinstruction(code, op, c)
end
end
-- code a test set, optimizing unit sets for ITestChar, "complete"
-- sets for ITestAny, and empty sets for IJmp (always fails).
-- 'e' is true iff test should accept the empty string. (Test
-- instructions in the current VM never accept the empty string.)
local function codetestset(code, cs, e, valuetable)
if e ~= 0 then
return NOINST -- no test
else
local pos = code.size
codecharset(code, cs, NOINST, valuetable)
local inst = code.p[pos]
local code = inst.code
if code == IFail then
inst.code = IJmp -- always jump
elseif code == IAny then
inst.code = ITestAny
elseif code == IChar then
inst.code = ITestChar
elseif code == ISet then
inst.code = ITestSet
else
assert(false)
end
return pos
end
end
-- Find the final destination of a sequence of jumps
local function finaltarget(code, i)
while code.p[i].code == IJmp do
i = i + code.p[i].offset
end
return i
end
-- final label (after traversing any jumps)
local function finallabel(code, i)
return finaltarget(code, i + code.p[i].offset)
end
-- <behind(p)> == behind n; <p> (where n = fixedlen(p))
local function codebehind(code, tree, index, valuetable)
if tree.p[index].val > 0 then
addinstruction(code, IBehind, tree.p[index].val)
end
codegen(code, tree, fullset, false, NOINST, index + 1, valuetable) -- NOINST
end
-- Choice; optimizations:
-- - when p1 is headfail
-- - when first(p1) and first(p2) are disjoint; than
-- a character not in first(p1) cannot go to p1, and a character
-- in first(p1) cannot go to p2 (at it is not in first(p2)).
-- (The optimization is not valid if p1 accepts the empty string,
-- as then there is no character at all...)
-- - when p2 is empty and opt is true; a IPartialCommit can resuse
-- the Choice already active in the stack.
local function codechoice(code, tree, fl, opt, p1, p2, valuetable)
local emptyp2 = tree.p[p2].tag == TTrue
local e1, st1 = getfirst(tree, fullset, p1, valuetable)
local _, st2 = getfirst(tree, fl, p2, valuetable)
if headfail(tree, p1) or (e1 == 0 and cs_disjoint(st1, st2)) then
-- <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2:
local test = codetestset(code, st1, 0, valuetable)
local jmp = NOINST;
codegen(code, tree, fl, false, test, p1, valuetable)
if not emptyp2 then
jmp = addinstruction(code, IJmp, 0)
end
jumptohere(code, test)
codegen(code, tree, fl, opt, NOINST, p2, valuetable)
jumptohere(code, jmp)
elseif opt and emptyp2 then
-- p1? == IPartialCommit; p1
jumptohere(code, addinstruction(code, IPartialCommit, 0))
codegen(code, tree, fullset, true, NOINST, p1, valuetable)
else
-- <p1 / p2> ==
-- test(fail(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2:
local test = codetestset(code, st1, e1, valuetable)
local pchoice = addinstruction(code, IChoice, 0)
codegen(code, tree, fullset, emptyp2, test, p1, valuetable)
local pcommit = addinstruction(code, ICommit, 0)
jumptohere(code, pchoice)
jumptohere(code, test)
codegen(code, tree, fl, opt, NOINST, p2, valuetable)
jumptohere(code, pcommit)
end
end
-- And predicate
-- optimization: fixedlen(p) = n ==> <&p> == <p>; behind n
-- (valid only when 'p' has no captures)
local function codeand(code, tree, tt, index, valuetable)
local n = fixedlenx(tree, 0, 0, index)
if n >= 0 and n <= MAXBEHINDPREDICATE and not hascaptures(tree, index) then
codegen(code, tree, fullset, false, tt, index, valuetable)
if n > 0 then
addinstruction(code, IBehind, n)
end
else
-- default: Choice L1; p1; BackCommit L2; L1: Fail; L2:
local pchoice = addinstruction(code, IChoice, 0)
codegen(code, tree, fullset, false, tt, index, valuetable)
local pcommit = addinstruction(code, IBackCommit, 0)
jumptohere(code, pchoice)
addinstruction(code, IFail, 0)
jumptohere(code, pcommit)
end
end
-- Captures: if pattern has fixed (and not too big) length, use
-- a single IFullCapture instruction after the match; otherwise,
-- enclose the pattern with OpenCapture - CloseCapture.
local function codecapture(code, tree, fl, tt, index, valuetable)
local len = fixedlenx(tree, 0, 0, index + 1)
if len >= 0 and len <= MAXOFF and not hascaptures(tree, index + 1) then
codegen(code, tree, fl, false, tt, index + 1, valuetable)
addinstcap(code, IFullCapture, tree.p[index].cap, tree.p[index].val, len)
else
addinstcap(code, IOpenCapture, tree.p[index].cap, tree.p[index].val, 0)
codegen(code, tree, fl, false, tt, index + 1, valuetable)
addinstcap(code, ICloseCapture, Cclose, 0, 0)
end
end
local function coderuntime(code, tree, tt, index, valuetable)
addinstcap(code, IOpenCapture, Cgroup, tree.p[index].val, 0)
codegen(code, tree, fullset, false, tt, index + 1, valuetable)
addinstcap(code, ICloseRunTime, Cclose, 0, 0)
end
-- Repetion; optimizations:
-- When pattern is a charset, can use special instruction ISpan.
-- When pattern is head fail, or if it starts with characters that
-- are disjoint from what follows the repetions, a simple test
-- is enough (a fail inside the repetition would backtrack to fail
-- again in the following pattern, so there is no need for a choice).
-- When 'opt' is true, the repetion can reuse the Choice already
-- active in the stack.
local function coderep(code, tree, opt, fl, index, valuetable)
local st = tocharset(tree, index, valuetable)
if st then
local op = coderealcharset(code, st, valuetable)
code.p[op].code = ISpan;
else
local e1, st = getfirst(tree, fullset, index, valuetable)
if headfail(tree, index) or (e1 == 0 and cs_disjoint(st, fl)) then
-- L1: test (fail(p1)) -> L2; <p>; jmp L1; L2:
local test = codetestset(code, st, 0, valuetable)
codegen(code, tree, fullset, false, test, index, valuetable)
local jmp = addinstruction(code, IJmp, 0)
jumptohere(code, test)
jumptothere(code, jmp, test)
else
-- test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2:
-- or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1;
local test = codetestset(code, st, e1, valuetable)
local pchoice = NOINST;
if opt then
jumptohere(code, addinstruction(code, IPartialCommit, 0))
else
pchoice = addinstruction(code, IChoice, 0)
end
local l2 = code.size
codegen(code, tree, fullset, false, NOINST, index, valuetable)
local commit = addinstruction(code, IPartialCommit, 0)
jumptothere(code, commit, l2)
jumptohere(code, pchoice)
jumptohere(code, test)
end
end
end
-- Not predicate; optimizations:
-- In any case, if first test fails, 'not' succeeds, so it can jump to
-- the end. If pattern is headfail, that is all (it cannot fail
-- in other parts); this case includes 'not' of simple sets. Otherwise,
-- use the default code (a choice plus a failtwice).
local function codenot(code, tree, index, valuetable)
local e, st = getfirst(tree, fullset, index, valuetable)
local test = codetestset(code, st, e, valuetable)
-- test (fail(p1)) -> L1; fail; L1:
if headfail(tree, index) then
addinstruction(code, IFail, 0)
else
-- test(fail(p))-> L1; choice L1; <p>; failtwice; L1:
local pchoice = addinstruction(code, IChoice, 0)
codegen(code, tree, fullset, false, NOINST, index, valuetable)
addinstruction(code, IFailTwice, 0)
jumptohere(code, pchoice)
end
jumptohere(code, test)
end
-- change open calls to calls, using list 'positions' to find
-- correct offsets; also optimize tail calls
local function correctcalls(code, positions, from, to)
for i = from, to - 1 do
if code.p[i].code == IOpenCall then
local n = code.p[i].offset; -- rule number
local rule = positions[n]; -- rule position
assert(rule == from or code.p[rule - 1].code == IRet)
-- call; ret ?
if bit.band(code.p[i].val, 0xffff) == 0 and code.p[finaltarget(code, i + 1)].code == IRet then
code.p[i].code = IJmp; -- tail call
else
code.p[i].code = ICall;
end
jumptothere(code, i, rule) -- call jumps to respective rule
end
end
end
-- Code for a grammar:
-- call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2:
local function codegrammar(code, tree, index, valuetable)
local positions = {}
local rulenumber = 1;
-- tree.p[rule].tag
local rule = index + 1
assert(tree.p[rule].tag == TRule)
local LR = 0
if band(RuleLR, tree.p[rule].cap) ~= 0 then LR = 1 end
local firstcall = addinstruction(code, ICall, LR) -- call initial rule
code.p[firstcall].aux = tree.p[rule].val
local jumptoend = addinstruction(code, IJmp, 0) -- jump to the end
jumptohere(code, firstcall) -- here starts the initial rule
while tree.p[rule].tag == TRule do
positions[rulenumber] = code.size -- save rule position
rulenumber = rulenumber + 1
codegen(code, tree, fullset, false, NOINST, rule + 1, valuetable) -- code rule
addinstruction(code, IRet, 0)
rule = rule + tree.p[rule].ps
end
assert(tree.p[rule].tag == TTrue)
jumptohere(code, jumptoend)
correctcalls(code, positions, firstcall + 2, code.size)
end
local function codecall(code, tree, index, val)
local c = addinstruction(code, IOpenCall, tree.p[index].cap) -- to be corrected later
code.p[c].aux = val
assert(tree.p[index + tree.p[index].ps].tag == TRule)
setoffset(code, c, band(tree.p[index + tree.p[index].ps].cap, 0x7fff)) -- offset = rule number
end
local function codeseq(code, tree, fl, opt, tt, p1, p2, valuetable)
if needfollow(tree, p1) then
local _, fll = getfirst(tree, fl, p2, valuetable) -- p1 follow is p2 first
codegen(code, tree, fll, false, tt, p1, valuetable)
else
-- use 'fullset' as follow
codegen(code, tree, fullset, false, tt, p1, valuetable)
end
-- can p1 consume anything?
if (fixedlenx(tree, 0, 0, p1) ~= 0) then
tt = NOINST; -- invalidate test
end
return codegen(code, tree, fl, opt, tt, p2, valuetable)
end
-- Main code-generation function: dispatch to auxiliar functions
-- according to kind of tree
-- code generation is recursive; 'opt' indicates that the code is being
-- generated as the last thing inside an optional pattern (so, if that
-- code is optional too, it can reuse the 'IChoice' already in place for
-- the outer pattern). 'tt' points to a previous test protecting this
-- code (or NOINST). 'fl' is the follow set of the pattern.
function codegen(code, tree, fl, opt, tt, index, valuetable)
local tag = tree.p[index].tag
if tag == TChar then
return codechar(code, tree.p[index].val, tt)
elseif tag == TAny then
return addinstruction(code, IAny, 0)
elseif tag == TSet then
return codecharset(code, valuetable[tree.p[index].val], tt, valuetable)
elseif tag == TTrue then
elseif tag == TFalse then
return addinstruction(code, IFail, 0)
elseif tag == TSeq then
return codeseq(code, tree, fl, opt, tt, index + 1, index + tree.p[index].ps, valuetable)
elseif tag == TChoice then
return codechoice(code, tree, fl, opt, index + 1, index + tree.p[index].ps, valuetable)
elseif tag == TRep then
return coderep(code, tree, opt, fl, index + 1, valuetable)
elseif tag == TBehind then
return codebehind(code, tree, index, valuetable)
elseif tag == TNot then
return codenot(code, tree, index + 1, valuetable)
elseif tag == TAnd then
return codeand(code, tree, tt, index + 1, valuetable)
elseif tag == TCapture then
return codecapture(code, tree, fl, tt, index, valuetable)
elseif tag == TRunTime then
return coderuntime(code, tree, tt, index, valuetable)
elseif tag == TGrammar then
return codegrammar(code, tree, index, valuetable)
elseif tag == TCall then
return codecall(code, tree, index, tree.p[index].val)
else
assert(false)
end
end
-- Optimize jumps and other jump-like instructions.
-- * Update labels of instructions with labels to their final
-- destinations (e.g., choice L1; ... L1: jmp L2: becomes
-- choice L2)
-- * Jumps to other instructions that do jumps become those
-- instructions (e.g., jump to return becomes a return; jump
-- to commit becomes a commit)
local function peephole(code)
local i = 0
while i < code.size do
local tag = code.p[i].code
if tag == IChoice or tag == ICall or tag == ICommit or tag == IPartialCommit or
tag == IBackCommit or tag == ITestChar or tag == ITestSet or tag == ITestAny then
-- instructions with labels
jumptothere(code, i, finallabel(code, i)) -- optimize label
elseif tag == IJmp then
local ft = finaltarget(code, i)
local tag = code.p[ft].code -- jumping to what?
-- instructions with unconditional implicit jumps
if tag == IRet or tag == IFail or tag == IFailTwice or tag == IEnd then
ffi.copy(code.p + i, code.p + ft, ffi.sizeof(patternelement)) -- jump becomes that instruction
elseif tag == ICommit or tag == IPartialCommit or tag == IBackCommit then
-- inst. with unconditional explicit jumps
local fft = finallabel(code, ft)
ffi.copy(code.p + i, code.p + ft, ffi.sizeof(patternelement)) -- jump becomes that instruction...
jumptothere(code, i, fft) -- but must correct its offset
i = i - 1 -- reoptimize its label
else
jumptothere(code, i, ft) -- optimize label
end
end
i = i + 1
end
end
-- Compile a pattern
local function compile(tree, index, valuetable)
local code = pattern()
codegen(code, tree, fullset, false, NOINST, index, valuetable)
addinstruction(code, IEnd, 0)
peephole(code)
ffi.C.free(tree.code)
tree.code = code
end
local function pat_new(ct, size)
size = size or 0
local allocsize = size
if allocsize < 10 then
allocsize = 10
end
local pat = ffi.cast('PATTERN*', ffi.C.malloc(ffi.sizeof(pattern)))
assert(pat ~= nil)
pat.allocsize = allocsize
pat.size = size
pat.p = ffi.C.malloc(ffi.sizeof(patternelement) * allocsize)
assert(pat.p ~= nil)
ffi.fill(pat.p, ffi.sizeof(patternelement) * allocsize)
return pat
end
local function doublesize(ct)
ct.p = ffi.C.realloc(ct.p, ffi.sizeof(patternelement) * ct.allocsize * 2)
assert(ct.p ~= nil)
ffi.fill(ct.p + ct.allocsize, ffi.sizeof(patternelement) * ct.allocsize)
ct.allocsize = ct.allocsize * 2
end
local pattreg = {
doublesize = doublesize,
}
local metareg = {
["__new"] = pat_new,
["__index"] = pattreg
}
ffi.metatype(pattern, metareg)
return {
checkaux = checkaux,
tocharset = tocharset,
fixedlenx = fixedlenx,
hascaptures = hascaptures,
compile = compile,
}