--[[ LPEGLJ lpeglj.lua Main module and tree generation Copyright (C) 2014 Rostislav Sacek. based on LPeg v1.0 - PEG pattern matching for Lua Lua.org & PUC-Rio written by Roberto Ierusalimschy http://www.inf.puc-rio.br/~roberto/lpeg/ ** Permission is hereby granted, free of charge, to any person obtaining ** a copy of this software and associated documentation files (the ** "Software"), to deal in the Software without restriction, including ** without limitation the rights to use, copy, modify, merge, publish, ** distribute, sublicense, and/or sell copies of the Software, and to ** permit persons to whom the Software is furnished to do so, subject to ** the following conditions: ** ** The above copyright notice and this permission notice shall be ** included in all copies or substantial portions of the Software. ** ** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, ** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF ** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. ** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY ** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ** ** [ MIT license: http://www.opensource.org/licenses/mit-license.php ] --]] assert(jit.version_num > 20000, "Use LuaJIT v2.0.1 or higher.") local ffi = require "ffi" local lpcode = require "lpcode" local lpprint = require "lpprint" local lpvm = require "lpvm" local band, bor, bnot, rshift, lshift = bit.band, bit.bor, bit.bnot, bit.rshift, bit.lshift ffi.cdef [[ int isalnum(int c); int isalpha(int c); int iscntrl(int c); int isdigit(int c); int isgraph(int c); int islower(int c); int isprint(int c); int ispunct(int c); int isspace(int c); int isupper(int c); int isxdigit(int c); ]] local MAXBEHIND = 255 local MAXRULES = 200 local VERSION = "1.0.0.0LJ" local TChar = 0 local TSet = 1 local TAny = 2 -- standard PEG elements local TTrue = 3 local TFalse = 4 local TRep = 5 local TSeq = 6 local TChoice = 7 local TNot = 8 local TAnd = 9 local TCall = 10 local TOpenCall = 11 local TRule = 12 -- sib1 is rule's pattern, sib2 is 'next' rule local TGrammar = 13 -- sib1 is initial (and first) rule local TBehind = 14 -- match behind local TCapture = 15 -- regular capture local TRunTime = 16 -- run-time capture local IAny = 0 -- if no char, fail local IChar = 1 -- if char != val, fail local ISet = 2 -- if char not in val, fail local ITestAny = 3 -- in no char, jump to 'offset' local ITestChar = 4 -- if char != val, jump to 'offset' local ITestSet = 5 -- if char not in val, jump to 'offset' local ISpan = 6 -- read a span of chars in val local IBehind = 7 -- walk back 'val' characters (fail if not possible) local IRet = 8 -- return from a rule local IEnd = 9 -- end of pattern local IChoice = 10 -- stack a choice; next fail will jump to 'offset' local IJmp = 11 -- jump to 'offset' local ICall = 12 -- call rule at 'offset' local IOpenCall = 13 -- call rule number 'offset' (must be closed to a ICall) local ICommit = 14 -- pop choice and jump to 'offset' local IPartialCommit = 15 -- update top choice to current position and jump local IBackCommit = 16 -- "fails" but jump to its own 'offset' local IFailTwice = 17 -- pop one choice and then fail local IFail = 18 -- go back to saved state on choice and jump to saved offset local IGiveup = 19 -- internal use local IFullCapture = 20 -- complete capture of last 'off' chars local IOpenCapture = 21 -- start a capture local ICloseCapture = 22 local ICloseRunTime = 23 local Cclose = 0 local Cposition = 1 local Cconst = 2 local Cbackref = 3 local Carg = 4 local Csimple = 5 local Ctable = 6 local Cfunction = 7 local Cquery = 8 local Cstring = 9 local Cnum = 10 local Csubst = 11 local Cfold = 12 local Cruntime = 13 local Cgroup = 14 local PEnullable = 0 local PEnofail = 1 local PEleftrecursion = 2 local newgrammar local RuleLR = 0x10000 local Ruleused = 0x20000 local BCapcandelete = 0x30000 local LREnable = false -- number of siblings for each tree local numsiblings = { 0, 0, 0, -- char, set, any 0, 0, -- true, false 1, -- rep 2, 2, -- seq, choice 1, 1, -- not, and 0, 0, 2, 1, -- call, opencall, rule, grammar 1, -- behind 1, 1 -- capture, runtime capture } local patternid = 0 local valuetable = {} local funcnames = setmetatable({}, { __mode = 'k' }) local treepatternelement = ffi.typeof('TREEPATTERN_ELEMENT') local treepattern = ffi.typeof('TREEPATTERN') local patternelement = ffi.typeof('PATTERN_ELEMENT') local pattern = ffi.typeof('PATTERN') local settype = ffi.typeof('int32_t[8]') local uint32 = ffi.typeof('uint32_t[1]') -- Fix a TOpenCall into a TCall node, using table 'postable' to -- translate a key to its rule address in the tree. Raises an -- error if key does not exist. local function fixonecall(postable, grammar, index, valuetable) local name = valuetable[grammar.p[index].val] -- get rule's name local n = postable[name] -- query name in position table -- no position? if not n then error(("rule '%s' undefined in given grammar"):format(type(name) == 'table' and '(a table)' or name), 0) end grammar.p[index].tag = TCall; grammar.p[index].ps = n - index -- position relative to node grammar.p[index + grammar.p[index].ps].cap = bit.bor(grammar.p[index + grammar.p[index].ps].cap, Ruleused) end -- Transform left associative constructions into right -- associative ones, for sequence and choice; that is: -- (t11 + t12) + t2 => t11 + (t12 + t2) -- (t11 * t12) * t2 => t11 * (t12 * t2) -- (that is, Op (Op t11 t12) t2 => Op t11 (Op t12 t2)) local function correctassociativity(tree, index) local t1 = index + 1 assert(tree.p[index].tag == TChoice or tree.p[index].tag == TSeq) while tree.p[t1].tag == tree.p[index].tag do local n1size = tree.p[index].ps - 1; -- t1 == Op t11 t12 local n11size = tree.p[t1].ps - 1; local n12size = n1size - n11size - 1 for i = 1, n11size do ffi.copy(tree.p + index + i, tree.p + t1 + i, ffi.sizeof(treepatternelement)) end tree.p[index].ps = n11size + 1 tree.p[index + tree.p[index].ps].tag = tree.p[index].tag tree.p[index + tree.p[index].ps].ps = n12size + 1 end end -- Make final adjustments in a tree. Fix open calls in tree, -- making them refer to their respective rules or raising appropriate -- errors (if not inside a grammar). Correct associativity of associative -- constructions (making them right associative). local function finalfix(fix, postable, grammar, index, valuetable) local tag = grammar.p[index].tag --subgrammars were already fixed if tag == TGrammar then return elseif tag == TOpenCall then -- inside a grammar? if fix then fixonecall(postable, grammar, index, valuetable) -- open call outside grammar else error(("rule '%s' used outside a grammar"):format(tostring(valuetable[grammar.p[index].val])), 0) end elseif tag == TSeq or tag == TChoice then correctassociativity(grammar, index) end local ns = numsiblings[tag + 1] if ns == 0 then elseif ns == 1 then return finalfix(fix, postable, grammar, index + 1, valuetable) elseif ns == 2 then finalfix(fix, postable, grammar, index + 1, valuetable) return finalfix(fix, postable, grammar, index + grammar.p[index].ps, valuetable) else assert(false) end end -- {====================================================== -- Tree generation -- ======================================================= local function newcharset() local tree = treepattern(1) valuetable[tree.id] = { settype() } tree.p[0].tag = TSet tree.p[0].val = 1 return tree, valuetable[tree.id][1] end -- add to tree a sequence where first sibling is 'sib' (with size -- 'sibsize') local function seqaux(tree, sib, start, sibsize) tree.p[start].tag = TSeq; tree.p[start].ps = sibsize + 1 ffi.copy(tree.p + start + 1, sib.p, ffi.sizeof(treepatternelement) * sibsize) end -- Build a sequence of 'n' nodes, each with tag 'tag' and 'val' got -- from the array 's' (or 0 if array is NULL). (TSeq is binary, so it -- must build a sequence of sequence of sequence...) local function fillseq(tree, tag, start, n, s) -- initial n-1 copies of Seq tag; Seq ... for i = 1, n - 1 do tree.p[start].tag = TSeq tree.p[start].ps = 2 tree.p[start + 1].tag = tag if s then tree.p[start + 1].val = s:sub(i, i):byte() end start = start + tree.p[start].ps end tree.p[start].tag = tag -- last one does not need TSeq if s then tree.p[start].val = s:sub(n, n):byte() end end -- Numbers as patterns: -- 0 == true (always match); n == TAny repeated 'n' times; -- -n == not (TAny repeated 'n' times) local function numtree(n) if n == 0 then local tree = treepattern(1) tree.p[0].tag = TTrue return tree else local tree, start if n > 0 then tree = treepattern(2 * n - 1) start = 0 -- negative: code it as !(-n) else n = -n; tree = treepattern(2 * n) tree.p[0].tag = TNot start = 1 end fillseq(tree, TAny, start, n) -- sequence of 'n' any's return tree; end end -- Convert value to a pattern local function getpatt(val, name) local typ = type(val) if typ == 'string' then -- empty? if #val == 0 then local pat = treepattern(1) pat.p[0].tag = TTrue -- always match return pat else local tree = treepattern(2 * (#val - 1) + 1) fillseq(tree, TChar, 0, #val, val) -- sequence of '#val' chars return tree end elseif typ == 'number' then return numtree(val) elseif typ == 'boolean' then local pat = treepattern(1) pat.p[0].tag = val and TTrue or TFalse return pat elseif typ == 'table' then return newgrammar(val) elseif typ == 'function' then if name and type(name) == 'string' then funcnames[val] = name end local pat = treepattern(2) valuetable[pat.id] = { val } pat.p[0].tag = TRunTime pat.p[0].val = 1 pat.p[1].tag = TTrue return pat elseif ffi.istype(treepattern, val) then assert(val.treesize > 0) return val end assert(false) end local function copykeys(ktable1, ktable2) local ktable, offset = {}, 0 if not ktable1 and not ktable2 then return ktable, 0 elseif ktable1 then for i = 1, #ktable1 do ktable[#ktable + 1] = ktable1[i] end offset = #ktable1 if not ktable2 then return ktable, 0 end end if ktable2 then for i = 1, #ktable2 do ktable[#ktable + 1] = ktable2[i] end end assert(#ktable < 65536, "too many Lua values in pattern") return ktable, offset end local function correctkeys(tree, index, offset) local tag = tree.p[index].tag if (tag == TSet or tag == TRule or tag == TCall or tag == TRunTime or tag == TOpenCall or tag == TCapture) and tree.p[index].val ~= 0 then tree.p[index].val = tree.p[index].val + offset end local ns = numsiblings[tag + 1] if ns == 0 then elseif ns == 1 then return correctkeys(tree, index + 1, offset) elseif ns == 2 then correctkeys(tree, index + 1, offset) return correctkeys(tree, index + tree.p[index].ps, offset) else assert(false) end end -- create a new tree, with a new root and one sibling. local function newroot1sib(tag, pat) local tree1 = getpatt(pat) local tree = treepattern(1 + tree1.treesize) -- create new tree valuetable[tree.id] = copykeys(valuetable[tree1.id]) tree.p[0].tag = tag ffi.copy(tree.p + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize) return tree end -- create a new tree, with a new root and 2 siblings. local function newroot2sib(tag, pat1, pat2) local tree1 = getpatt(pat1) local tree2 = getpatt(pat2) local tree = treepattern(1 + tree1.treesize + tree2.treesize) -- create new tree local ktable, offset = copykeys(valuetable[tree1.id], valuetable[tree2.id]) valuetable[tree.id] = ktable tree.p[0].tag = tag tree.p[0].ps = 1 + tree1.treesize ffi.copy(tree.p + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize) ffi.copy(tree.p + 1 + tree1.treesize, tree2.p, ffi.sizeof(treepatternelement) * tree2.treesize) if offset > 0 then correctkeys(tree, 1 + tree1.treesize, offset) end return tree; end local function lp_P(val, name) assert(type(val) ~= 'nil') return getpatt(val, name) end -- sequence operator; optimizations: -- false x => false, x true => x, true x => x -- (cannot do x . false => false because x may have runtime captures) local function lp_seq(pat1, pat2) local tree1 = getpatt(pat1) local tree2 = getpatt(pat2) -- false . x == false, x . true = x if tree1.p[0].tag == TFalse or tree2.p[0].tag == TTrue then return tree1 -- true . x = x elseif tree1.p[0].tag == TTrue then return tree2 else return newroot2sib(TSeq, tree1, tree2) end end -- choice operator; optimizations: -- charset / charset => charset -- true / x => true, x / false => x, false / x => x -- (x / true is not equivalent to true) local function lp_choice(pat1, pat2) local tree1 = getpatt(pat1) local tree2 = getpatt(pat2) local charset1 = lpcode.tocharset(tree1, 0, valuetable[tree1.id]) local charset2 = lpcode.tocharset(tree2, 0, valuetable[tree2.id]) if charset1 and charset2 then local t, set = newcharset() for i = 0, 7 do set[i] = bor(charset1[i], charset2[i]) end return t elseif lpcode.checkaux(tree1, PEnofail, 0) or tree2.p[0].tag == TFalse then return tree1 -- true / x => true, x / false => x elseif tree1.p[0].tag == TFalse then return tree2 -- false / x => x else return newroot2sib(TChoice, tree1, tree2) end end -- p^n local function lp_star(tree1, n) local tree n = tonumber(n) assert(type(n) == 'number') -- seq tree1 (seq tree1 ... (seq tree1 (rep tree1))) if n >= 0 then tree = treepattern((n + 1) * (tree1.treesize + 1)) if lpcode.checkaux(tree1, PEnullable, 0) then error("loop body may accept empty string", 0) end valuetable[tree.id] = copykeys(valuetable[tree1.id]) local start = 0 -- repeat 'n' times for i = 1, n do seqaux(tree, tree1, start, tree1.treesize) start = start + tree.p[start].ps end tree.p[start].tag = TRep ffi.copy(tree.p + start + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize) -- choice (seq tree1 ... choice tree1 true ...) true else n = -n; -- size = (choice + seq + tree1 + true) * n, but the last has no seq tree = treepattern(n * (tree1.treesize + 3) - 1) valuetable[tree.id] = copykeys(valuetable[tree1.id]) local start = 0 -- repeat (n - 1) times for i = n, 2, -1 do tree.p[start].tag = TChoice; tree.p[start].ps = i * (tree1.treesize + 3) - 2 tree.p[start + tree.p[start].ps].tag = TTrue; start = start + 1 seqaux(tree, tree1, start, tree1.treesize) start = start + tree.p[start].ps end tree.p[start].tag = TChoice; tree.p[start].ps = tree1.treesize + 1 tree.p[start + tree.p[start].ps].tag = TTrue ffi.copy(tree.p + start + 1, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize) end return tree end -- #p == &p local function lp_and(pat) return newroot1sib(TAnd, pat) end -- -p == !p local function lp_not(pat) return newroot1sib(TNot, pat) end -- [t1 - t2] == Seq (Not t2) t1 -- If t1 and t2 are charsets, make their difference. local function lp_sub(pat1, pat2) local tree1 = getpatt(pat1) local tree2 = getpatt(pat2) local charset1 = lpcode.tocharset(tree1, 0, valuetable[tree1.id]) local charset2 = lpcode.tocharset(tree2, 0, valuetable[tree2.id]) if charset1 and charset2 then local tree, set = newcharset() for i = 0, 7 do set[i] = band(charset1[i], bnot(charset2[i])) end return tree else local tree = treepattern(2 + tree1.treesize + tree2.treesize) local ktable, offset = copykeys(valuetable[tree2.id], valuetable[tree1.id]) valuetable[tree.id] = ktable tree.p[0].tag = TSeq; -- sequence of... tree.p[0].ps = 2 + tree2.treesize tree.p[1].tag = TNot; -- ...not... ffi.copy(tree.p + 2, tree2.p, ffi.sizeof(treepatternelement) * tree2.treesize) ffi.copy(tree.p + tree2.treesize + 2, tree1.p, ffi.sizeof(treepatternelement) * tree1.treesize) if offset > 0 then correctkeys(tree, 2 + tree2.treesize, offset) end return tree end end local function lp_set(val) assert(type(val) == 'string') local tree, set = newcharset() for i = 1, #val do local b = val:sub(i, i):byte() set[rshift(b, 5)] = bor(set[rshift(b, 5)], lshift(1, band(b, 31))) end return tree end local function lp_range(...) local args = { ... } local top = #args local tree, set = newcharset() for i = 1, top do assert(#args[i] == 2, args[i] .. " range must have two characters") for b = args[i]:sub(1, 1):byte(), args[i]:sub(2, 2):byte() do set[rshift(b, 5)] = bor(set[rshift(b, 5)], lshift(1, band(b, 31))) end end return tree end -- Look-behind predicate local function lp_behind(pat) local tree1 = getpatt(pat) local n = lpcode.fixedlenx(tree1, 0, 0, 0) assert(not lpcode.hascaptures(tree1, 0), "pattern have captures") assert(n >= 0, "pattern may not have fixed length") assert(n <= MAXBEHIND, "pattern too long to look behind") local tree = newroot1sib(TBehind, pat) tree.p[0].val = n; return tree end -- Create a non-terminal local function lp_V(val, p) assert(val, "non-nil value expected") local tree = treepattern(1) valuetable[tree.id] = { val } tree.p[0].tag = TOpenCall tree.p[0].val = 1 tree.p[0].cap = p or 0 return tree end -- Create a tree for a non-empty capture, with a body and -- optionally with an associated value local function capture_aux(cap, pat, val) local tree = newroot1sib(TCapture, pat) tree.p[0].cap = cap if val then local ind = #valuetable[tree.id] + 1 assert(ind <= 65536, "too many Lua values in pattern" .. ind) valuetable[tree.id][ind] = val tree.p[0].val = ind end return tree end -- Fill a tree with an empty capture, using an empty (TTrue) sibling. local function auxemptycap(tree, cap, par, start) tree.p[start].tag = TCapture; tree.p[start].cap = cap if type(par) ~= 'nil' then local ind = #valuetable[tree.id] + 1 assert(ind <= 65536, "too many Lua values in pattern") valuetable[tree.id][ind] = par tree.p[start].val = ind end tree.p[start + 1].tag = TTrue; end -- Create a tree for an empty capture local function newemptycap(cap, par) local tree = treepattern(2) if type(par) ~= 'nil' then valuetable[tree.id] = {} end auxemptycap(tree, cap, par, 0) return tree end -- Captures with syntax p / v -- (function capture, query capture, string capture, or number capture) local function lp_divcapture(pat, par, xxx) local typ = type(par) if typ == "function" then return capture_aux(Cfunction, pat, par) elseif typ == "table" then return capture_aux(Cquery, pat, par) elseif typ == "string" then return capture_aux(Cstring, pat, par) elseif typ == "number" then local tree = newroot1sib(TCapture, pat) assert(0 <= par and par <= 0xffff, "invalid number") tree.p[0].cap = Cnum; local ind = #valuetable[tree.id] + 1 assert(ind <= 65536, "too many Lua values in pattern") valuetable[tree.id][ind] = par tree.p[0].val = ind return tree else error("invalid replacement value", 0) end end local function lp_substcapture(pat) return capture_aux(Csubst, pat) end local function lp_tablecapture(pat) return capture_aux(Ctable, pat, 0) end local function lp_groupcapture(pat, val) if not val then return capture_aux(Cgroup, pat) else return capture_aux(Cgroup, pat, val) end end local function lp_foldcapture(pat, fce) assert(type(fce) == 'function') return capture_aux(Cfold, pat, fce) end local function lp_simplecapture(pat) return capture_aux(Csimple, pat) end local function lp_poscapture() return newemptycap(Cposition) end local function lp_argcapture(val) assert(type(val) == 'number') local tree = newemptycap(Carg, 0) local ind = #valuetable[tree.id] + 1 assert(ind <= 65536, "too many Lua values in pattern") valuetable[tree.id][ind] = val tree.p[0].val = ind assert(0 < val and val <= 0xffff, "invalid argument index") return tree end local function lp_backref(val) return newemptycap(Cbackref, val) end -- Constant capture local function lp_constcapture(...) local tree local args = { ... } local n = select('#', ...) -- number of values -- no values? if n == 0 then tree = treepattern(1) -- no capture tree.p[0].tag = TTrue elseif n == 1 then tree = newemptycap(Cconst, args[1]) -- single constant capture -- create a group capture with all values else tree = treepattern(3 + 3 * (n - 1)) valuetable[tree.id] = {} tree.p[0].tag = TCapture tree.p[0].cap = Cgroup local start = 1 for i = 1, n - 1 do tree.p[start].tag = TSeq tree.p[start].ps = 3 auxemptycap(tree, Cconst, args[i], start + 1) start = start + tree.p[start].ps end auxemptycap(tree, Cconst, args[n], start) end return tree end local function lp_matchtime(pat, fce, name) assert(type(fce) == 'function') if name and type(name) == 'string' then funcnames[fce] = name end local tree = newroot1sib(TRunTime, pat) local ind = #valuetable[tree.id] + 1 assert(ind <= 65536, "too many Lua values in pattern") valuetable[tree.id][ind] = fce tree.p[0].val = ind return tree end -- ====================================================== -- ====================================================== -- Grammar - Tree generation -- ======================================================= -- return index and the pattern for the -- initial rule of grammar; -- also add that index into position table. local function getfirstrule(pat, postab) local key -- access first element if type(pat[1]) == 'string' then key = pat[1] else key = 1 end local rule = pat[key] if not rule then error("grammar has no initial rule", 0) end -- initial rule not a pattern? if not ffi.istype(treepattern, rule) then error(("initial rule '%s' is not a pattern"):format(tostring(key)), 0) end postab[key] = 1 return key, rule end -- traverse grammar, collect all its keys and patterns -- into rule table. Create a new table (before all pairs key-pattern) to -- collect all keys and their associated positions in the final tree -- (the "position table"). -- Return the number of rules and the total size -- for the new tree. local function collectrules(pat) local n = 1; -- to count number of rules local postab = {} local firstkeyrule, firstrule = getfirstrule(pat, postab) local rules = { firstkeyrule, firstrule } local size = 2 + firstrule.treesize -- TGrammar + TRule + rule for key, val in pairs(pat) do -- initial rule? if key ~= 1 and tostring(val) ~= tostring(firstrule) then -- value is not a pattern? if not ffi.istype(treepattern, val) then error(("rule '%s' is not a pattern"):format(tostring(key)), 0) end rules[#rules + 1] = key rules[#rules + 1] = val postab[key] = size size = 1 + size + val.treesize n = n + 1 end end size = size + 1; -- TTrue to finish list of rules return n, size, rules, postab end local function buildgrammar(grammar, rules, n, index, valuetable) local ktable, offset = {}, 0 -- add each rule into new tree for i = 1, n do local size = rules[i * 2].treesize grammar.p[index].tag = TRule; grammar.p[index].cap = i; -- rule number grammar.p[index].ps = size + 1; -- point to next rule local ind = #ktable + 1 ktable[ind] = rules[i * 2 - 1] grammar.p[index].val = ind ffi.copy(grammar.p + index + 1, rules[i * 2].p, ffi.sizeof(treepatternelement) * size) -- copy rule ktable, offset = copykeys(ktable, valuetable[rules[i * 2].id]) if offset > 0 then correctkeys(grammar, index + 1, offset) end index = index + grammar.p[index].ps; -- move to next rule end grammar.p[index].tag = TTrue; -- finish list of rules return ktable end -- Check whether a tree has potential infinite loops local function checkloops(tree, index) local tag = tree.p[index].tag if tag == TRep and lpcode.checkaux(tree, PEnullable, index + 1) then return true elseif tag == TGrammar then return -- sub-grammars already checked else local tag = numsiblings[tree.p[index].tag + 1] if tag == 0 then return elseif tag == 1 then return checkloops(tree, index + 1) elseif tag == 2 then if checkloops(tree, index + 1) then return true else return checkloops(tree, index + tree.p[index].ps) end else assert(false) end end end -- Check whether a rule can be left recursive; returns PEleftrecursion in that -- case; otherwise return 1 iff pattern is nullable. local function verifyrule(rulename, tree, passed, nullable, index, valuetable) local tag = tree.p[index].tag if tag == TChar or tag == TSet or tag == TAny or tag == TFalse then return nullable; -- cannot pass from here elseif tag == TTrue or tag == TBehind then return true; elseif tag == TNot or tag == TAnd or tag == TRep then return verifyrule(rulename, tree, passed, true, index + 1, valuetable) elseif tag == TCapture or tag == TRunTime then return verifyrule(rulename, tree, passed, nullable, index + 1, valuetable) elseif tag == TCall then local rule = valuetable[tree.p[index].val] if rule == rulename then return PEleftrecursion end if passed[rule] and passed[rule] > MAXRULES then return nullable end return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable) -- only check 2nd child if first is nullable elseif tag == TSeq then local res = verifyrule(rulename, tree, passed, false, index + 1, valuetable) if res == PEleftrecursion then return res elseif not res then return nullable else return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable) end -- must check both children elseif tag == TChoice then nullable = verifyrule(rulename, tree, passed, nullable, index + 1, valuetable) if nullable == PEleftrecursion then return nullable end return verifyrule(rulename, tree, passed, nullable, index + tree.p[index].ps, valuetable) elseif tag == TRule then local rule = valuetable[tree.p[index].val] passed[rule] = (passed[rule] or 0) + 1 return verifyrule(rulename, tree, passed, nullable, index + 1, valuetable) elseif tag == TGrammar then return lpcode.checkaux(tree, PEnullable, index) -- sub-grammar cannot be left recursive else assert(false) end end local function verifygrammar(rule, index, valuetable) -- check left-recursive rules local LR = {} local ind = index + 1 while rule.p[ind].tag == TRule do local rulename = valuetable[rule.p[ind].val] -- used rule if rulename then if verifyrule(rulename, rule, {}, false, ind + 1, valuetable) == PEleftrecursion then if not LREnable then error(("rule '%s' may be left recursive"):format(rulename), 0) end LR[rulename] = true end end ind = ind + rule.p[ind].ps end assert(rule.p[ind].tag == TTrue) for i = 0, rule.treesize - 1 do if rule.p[i].tag == TRule and LR[valuetable[rule.p[i].val]] then rule.p[i].cap = bor(rule.p[i].cap, RuleLR) --TRule can be left recursive end if rule.p[i].tag == TCall and LR[valuetable[rule.p[i].val]] then if rule.p[i].cap == 0 then rule.p[i].cap = 1 --TCall can be left recursive end end end -- check infinite loops inside rules ind = index + 1 while rule.p[ind].tag == TRule do -- used rule if rule.p[ind].val then if checkloops(rule, ind + 1) then error(("empty loop in rule '%s'"):format(tostring(valuetable[rule.p[ind].val])), 0) end end ind = ind + rule.p[ind].ps end assert(rule.p[ind].tag == TTrue) end -- Give a name for the initial rule if it is not referenced local function initialrulename(grammar, val, valuetable) grammar.p[1].cap = bit.bor(grammar.p[1].cap, Ruleused) -- initial rule is not referenced? if grammar.p[1].val == 0 then local ind = #valuetable + 1 assert(ind <= 65536, "too many Lua values in pattern") valuetable[ind] = val grammar.p[1].val = ind end end function newgrammar(pat) -- traverse grammar. Create a new table (before all pairs key-pattern) to -- collect all keys and their associated positions in the final tree -- (the "position table"). -- Return new tree. local n, size, rules, postab = collectrules(pat) local grammar = treepattern(size) local start = 0 grammar.p[start].tag = TGrammar grammar.p[start].val = n valuetable[grammar.id] = buildgrammar(grammar, rules, n, start + 1, valuetable) finalfix(true, postab, grammar, start + 1, valuetable[grammar.id]) initialrulename(grammar, rules[1], valuetable[grammar.id]) verifygrammar(grammar, 0, valuetable[grammar.id]) return grammar end -- ====================================================== -- remove duplicity from value table local function reducevaluetable(p) local vtable = valuetable[p.id] local value = {} local newvaluetable = {} local function check(v) if v > 0 then local ord = value[vtable[v]] if not ord then newvaluetable[#newvaluetable + 1] = vtable[v] ord = #newvaluetable value[vtable[v]] = ord end return ord end return 0 end local function itertree(p, index) local tag = p.p[index].tag if tag == TSet or tag == TCall or tag == TOpenCall or tag == TRule or tag == TCapture or tag == TRunTime then p.p[index].val = check(p.p[index].val) end local ns = numsiblings[tag + 1] if ns == 0 then elseif ns == 1 then return itertree(p, index + 1) elseif ns == 2 then itertree(p, index + 1) return itertree(p, index + p.p[index].ps) else assert(false) end end if p.treesize > 0 then itertree(p, 0) end if p.code ~= nil then for i = 0, p.code.size - 1 do local code = p.code.p[i].code if code == ICall or code == IJmp then p.code.p[i].aux = check(p.code.p[i].aux) elseif code == ISet or code == ITestSet or code == ISpan then p.code.p[i].val = check(p.code.p[i].val) elseif code == IOpenCapture or code == IFullCapture then p.code.p[i].offset = check(p.code.p[i].offset) end end end valuetable[p.id] = newvaluetable end local function checkalt(tree) local notchecked = {} local notinalternativerules = {} local function iter(tree, index, choice, rule) local tag = tree[index].tag if tag == TCapture and bit.band(tree[index].cap, 0xffff) == Cgroup then if not choice then if rule then notchecked[rule] = index end else tree[index].cap = bit.bor(tree[index].cap, BCapcandelete) end elseif tag == TChoice then choice = true elseif tag == TRule then rule = tree[index].val if bit.band(tree[index].cap, 0xffff) - 1 == 0 then notinalternativerules[rule] = notinalternativerules[rule] or true end elseif tag == TCall then local r = tree[index].val if not choice then notinalternativerules[r] = notinalternativerules[r] or true end end local sibs = numsiblings[tree[index].tag + 1] or 0 if sibs >= 1 then iter(tree, index + 1, choice, rule) if sibs >= 2 then return iter(tree, index + tree[index].ps, choice, rule) end end end iter(tree, 0) for k, v in pairs(notchecked) do if not notinalternativerules[k] then tree[v].cap = bit.bor(tree[v].cap, BCapcandelete) end end end local function prepcompile(p, index) finalfix(false, nil, p, index, valuetable[p.id]) checkalt(p.p) lpcode.compile(p, index, valuetable[p.id]) reducevaluetable(p) return p.code end local function lp_printtree(pat, c) assert(pat.treesize > 0) if c then finalfix(false, nil, pat, 0, valuetable[pat.id]) end lpprint.printtree(pat.p, 0, 0, valuetable[pat.id]) end local function lp_printcode(pat) -- not compiled yet? if pat.code == nil then prepcompile(pat, 0) end lpprint.printpatt(pat.code, valuetable[pat.id]) end -- Main match function local function lp_match(pat, s, init, ...) local p = ffi.istype(treepattern, pat) and pat or getpatt(pat) p.code = p.code ~= nil and p.code or prepcompile(p, 0) return lpvm.match(p, s, init, valuetable[p.id], ...) end local function lp_streammatch(pat, init, ...) local p = ffi.istype(treepattern, pat) and pat or getpatt(pat) p.code = p.code ~= nil and p.code or prepcompile(p, 0) return lpvm.streammatch(p, init, valuetable[p.id], ...) end -- Only for testing purpose -- stream emulation (send all chars from string one char after char) local function lp_emulatestreammatch(pat, s, init, ...) local p = ffi.istype(treepattern, pat) and pat or getpatt(pat) p.code = p.code ~= nil and p.code or prepcompile(p, 0) return lpvm.emulatestreammatch(p, s, init, valuetable[p.id], ...) end -- {====================================================== -- Library creation and functions not related to matching -- ======================================================= local function lp_setmax(val) lpvm.setmax(val) end local function lp_setmaxbehind(val) lpvm.setmaxbehind(val) end local function lp_enableleftrecursion(val) LREnable = val end local function lp_version() return VERSION end local function lp_type(pat) if ffi.istype(treepattern, pat) then return "pattern" end end local function createcat(tab, catname, catfce) local t, set = newcharset() for i = 0, 255 do if catfce(i) ~= 0 then set[rshift(i, 5)] = bor(set[rshift(i, 5)], lshift(1, band(i, 31))) end end tab[catname] = t end local function lp_locale(tab) tab = tab or {} createcat(tab, "alnum", function(c) return ffi.C.isalnum(c) end) createcat(tab, "alpha", function(c) return ffi.C.isalpha(c) end) createcat(tab, "cntrl", function(c) return ffi.C.iscntrl(c) end) createcat(tab, "digit", function(c) return ffi.C.isdigit(c) end) createcat(tab, "graph", function(c) return ffi.C.isgraph(c) end) createcat(tab, "lower", function(c) return ffi.C.islower(c) end) createcat(tab, "print", function(c) return ffi.C.isprint(c) end) createcat(tab, "punct", function(c) return ffi.C.ispunct(c) end) createcat(tab, "space", function(c) return ffi.C.isspace(c) end) createcat(tab, "upper", function(c) return ffi.C.isupper(c) end) createcat(tab, "xdigit", function(c) return ffi.C.isxdigit(c) end) return tab end local function lp_new(ct, size) local pat = ffi.new(ct, size) pat.treesize = size patternid = patternid + 1 pat.id = patternid return pat end local function lp_gc(ct) valuetable[ct.id] = nil if ct.code ~= nil then ffi.C.free(ct.code.p) ffi.C.free(ct.code) end end local function lp_eq(ct1, ct2) return tostring(ct1) == tostring(ct2) end local function lp_load(str, fcetab) local pat, t = lpvm.load(str, fcetab, true) valuetable[pat.id] = t return pat end local function lp_loadfile(fname, fcetab) local pat, t = lpvm.loadfile(fname, fcetab, true) valuetable[pat.id] = t return pat end local function lp_dump(ct, tree) local funccount = 0 -- not compiled yet? if ct.code == nil then prepcompile(ct, 0) end local out = {} if tree then out[#out + 1] = ffi.string(uint32(ct.treesize), 4) out[#out + 1] = ffi.string(ct.p, ffi.sizeof(treepatternelement) * ct.treesize) else out[#out + 1] = ffi.string(uint32(0), 4) end out[#out + 1] = ffi.string(uint32(ct.code.size), 4) out[#out + 1] = ffi.string(ct.code.p, ct.code.size * ffi.sizeof(patternelement)) local t = valuetable[ct.id] local len = t and #t or 0 out[#out + 1] = ffi.string(uint32(len), 4) if len > 0 then for _, val in ipairs(t) do local typ = type(val) if typ == 'string' then out[#out + 1] = 'str' out[#out + 1] = ffi.string(uint32(#val), 4) out[#out + 1] = val elseif typ == 'number' then local val = tostring(val) out[#out + 1] = 'num' out[#out + 1] = ffi.string(uint32(#val), 4) out[#out + 1] = val elseif typ == 'cdata' then out[#out + 1] = 'cdt' out[#out + 1] = ffi.string(val, ffi.sizeof(val)) elseif typ == 'function' then out[#out + 1] = 'fnc' funccount = funccount + 1 local name = funcnames[val] or ('FNAME%03d'):format(funccount) out[#out + 1] = ffi.string(uint32(#name), 4) out[#out + 1] = name if not funcnames[val] and debug.getupvalue(val, 1) then io.write(("Patterns function (%d) contains upvalue (%s) - use symbol name for function (%s).\n"):format(funccount, debug.getupvalue(val, 1), name), 0) end local data = string.dump(val, true) out[#out + 1] = ffi.string(uint32(#data), 4) out[#out + 1] = data else error(("Type '%s' NYI for dump"):format(typ), 0) end end end return table.concat(out) end local function lp_save(ct, fname, tree) local file = assert(io.open(fname, 'wb')) file:write(lp_dump(ct, tree)) file:close() end local pattreg = { ["ptree"] = lp_printtree, ["pcode"] = lp_printcode, ["match"] = lp_match, ["streammatch"] = lp_streammatch, ["emulatestreammatch"] = lp_emulatestreammatch, ["setmaxbehind"] = lp_setmaxbehind, ["B"] = lp_behind, ["V"] = lp_V, ["C"] = lp_simplecapture, ["Cc"] = lp_constcapture, ["Cmt"] = lp_matchtime, ["Cb"] = lp_backref, ["Carg"] = lp_argcapture, ["Cp"] = lp_poscapture, ["Cs"] = lp_substcapture, ["Ct"] = lp_tablecapture, ["Cf"] = lp_foldcapture, ["Cg"] = lp_groupcapture, ["P"] = lp_P, ["S"] = lp_set, ["R"] = lp_range, ["L"] = lp_and, ["locale"] = lp_locale, ["version"] = lp_version, ["setmaxstack"] = lp_setmax, ["type"] = lp_type, ["enableleftrecursion"] = lp_enableleftrecursion, ["enablememoization"] = lpvm.enablememoization, ["enabletracing"] = lpvm.enabletracing, ["save"] = lp_save, ["dump"] = lp_dump, ["load"] = lp_load, ["loadfile"] = lp_loadfile, ["__mul"] = lp_seq, ["__add"] = lp_choice, ["__pow"] = lp_star, ["__len"] = lp_and, ["__div"] = lp_divcapture, ["__unm"] = lp_not, ["__sub"] = lp_sub, } local metareg = { ["__gc"] = lp_gc, ["__new"] = lp_new, ["__mul"] = lp_seq, ["__add"] = lp_choice, ["__pow"] = lp_star, ["__len"] = lp_and, ["__div"] = lp_divcapture, ["__unm"] = lp_not, ["__sub"] = lp_sub, ["__eq"] = lp_eq, ["__index"] = pattreg } ffi.metatype(treepattern, metareg) return pattreg