TombEngine/Documentation/compiler/pl/stringx.lua

918 lines
26 KiB
Lua
Raw Permalink Normal View History

--- Python-style extended string library.
--
-- see 3.6.1 of the Python reference.
-- If you want to make these available as string methods, then say
-- `stringx.import()` to bring them into the standard `string` table.
--
-- See @{03-strings.md|the Guide}
--
-- Dependencies: `pl.utils`, `pl.types`
-- @module pl.stringx
local utils = require 'pl.utils'
local is_callable = require 'pl.types'.is_callable
local string = string
local find = string.find
local type,setmetatable,ipairs = type,setmetatable,ipairs
local error = error
local gsub = string.gsub
local rep = string.rep
local sub = string.sub
local reverse = string.reverse
local concat = table.concat
local append = table.insert
local remove = table.remove
local escape = utils.escape
local ceil, max = math.ceil, math.max
local assert_arg,usplit = utils.assert_arg,utils.split
local lstrip
local unpack = utils.unpack
local pack = utils.pack
-- Argument guard: delegates to `assert_arg` to check that argument number
-- `argn`, whose value is `value`, is a string.
local function assert_string(argn, value)
    assert_arg(argn, value, 'string')
end
-- Predicate used as a verifier by `assert_nonempty_string`:
-- true when `str` has a length of at least one.
local function non_empty(str)
    return #str >= 1
end
-- Argument guard: argument number `argn` (value `value`) must be a string,
-- and `non_empty` is passed as the extra verifier with its failure message.
local function assert_nonempty_string(argn, value)
    assert_arg(argn, value, 'string', non_empty, 'must be a non-empty string')
end
-- Turn the plain table `items` into a `pl.List` by setting its metatable.
-- `pl.List` is required at call time rather than at the top of the file.
local function makelist(items)
    local List = require('pl.List')
    return setmetatable(items, List)
end
-- Module table: the public functions defined below are attached to it,
-- and it is returned at the end of the file.
local stringx = {}
------------------
-- String Predicates
-- @section predicates
--- does s only contain alphabetic characters?
-- An empty string yields `false` (at least one character is required).
-- @string s a string
-- @treturn boolean `true` if every character matches `%a`
function stringx.isalpha(s)
    assert_string(1, s)
    -- the pattern is anchored at both ends, so any match covers the whole string
    return find(s, '^%a+$') ~= nil
end
--- does s only contain digits?
-- An empty string yields `false` (at least one character is required).
-- @string s a string
-- @treturn boolean `true` if every character matches `%d`
function stringx.isdigit(s)
    assert_string(1, s)
    -- the pattern is anchored at both ends, so any match covers the whole string
    return find(s, '^%d+$') ~= nil
end
--- does s only contain alphanumeric characters?
-- An empty string yields `false` (at least one character is required).
-- @string s a string
-- @treturn boolean `true` if every character matches `%w`
function stringx.isalnum(s)
    assert_string(1, s)
    -- the pattern is anchored at both ends, so any match covers the whole string
    return find(s, '^%w+$') ~= nil
end
--- does s only contain whitespace?
-- Matches on pattern '%s' so matches space, newline, tabs, etc.
-- An empty string yields `false` (at least one character is required).
-- @string s a string
-- @treturn boolean `true` if every character matches `%s`
function stringx.isspace(s)
    assert_string(1, s)
    -- the pattern is anchored at both ends, so any match covers the whole string
    return find(s, '^%s+$') ~= nil
end
--- does s only contain lower case characters?
-- Whitespace is accepted as well: the test is against the set `[%l%s]`.
-- An empty string yields `false` (at least one character is required).
-- @string s a string
-- @treturn boolean `true` if every character is a lower case letter or whitespace
function stringx.islower(s)
    assert_string(1, s)
    -- the pattern is anchored at both ends, so any match covers the whole string
    return find(s, '^[%l%s]+$') ~= nil
end
--- does s only contain upper case characters?
-- Whitespace is accepted as well: the test is against the set `[%u%s]`.
-- An empty string yields `false` (at least one character is required).
-- @string s a string
-- @treturn boolean `true` if every character is an upper case letter or whitespace
function stringx.isupper(s)
    assert_string(1, s)
    -- the pattern is anchored at both ends, so any match covers the whole string
    return find(s, '^[%u%s]+$') ~= nil
end
-- Plain-text (no pattern matching) test that `s` begins with `prefix`.
local function raw_startswith(s, prefix)
    local first_hit = find(s, prefix, 1, true)
    return first_hit == 1
end
-- Plain-text (no pattern matching) test that `s` ends with `suffix`.
-- Always returns a real boolean.
local function raw_endswith(s, suffix)
    -- position at which the suffix would have to begin
    local start = #s - #suffix + 1
    if start < 1 then
        -- suffix is longer than the string
        return false
    end
    return find(s, suffix, start, true) ~= nil
end
-- Apply the two-argument test `fn(s, affix)` to one affix or to a list of them.
-- A string affix returns whatever `fn` returns; a table returns `true` as soon
-- as `fn` is truthy for one entry and `false` otherwise; any other type raises.
local function test_affixes(s, affixes, fn)
    local kind = type(affixes)
    if kind == 'string' then
        return fn(s, affixes)
    end
    if kind ~= 'table' then
        error(("argument #2 expected a 'string' or a 'table', got a '%s'"):format(kind))
    end
    for _, candidate in ipairs(affixes) do
        if fn(s, candidate) then
            return true
        end
    end
    return false
end
--- does s start with prefix or one of prefixes?
-- The comparison is plain text; no pattern matching is involved.
-- Raises an error if `prefix` is neither a string nor a table.
-- @string s a string
-- @param prefix a string or an array of strings
-- @treturn boolean `true` if `s` starts with (one of) the given prefix(es)
function stringx.startswith(s, prefix)
    assert_string(1, s)
    return test_affixes(s, prefix, raw_startswith)
end
--- does s end with suffix or one of suffixes?
-- The comparison is plain text; no pattern matching is involved.
-- Raises an error if `suffix` is neither a string nor a table.
-- @string s a string
-- @param suffix a string or an array of strings
-- @treturn boolean `true` if `s` ends with (one of) the given suffix(es)
function stringx.endswith(s, suffix)
    assert_string(1, s)
    return test_affixes(s, suffix, raw_endswith)
end
--- Strings and Lists
-- @section lists
--- concatenate the strings using this string as a delimiter.
-- Note that the arguments are reversed from `table.concat`.
-- @string s the delimiter string
-- @param seq a table of strings or numbers
-- @return the joined string
-- @usage stringx.join(' ', {1,2,3}) == '1 2 3'
function stringx.join(s, seq)
    assert_string(1, s)
    local joined = concat(seq, s)
    return joined
end
--- Split a string into a list of lines.
-- `"\r"`, `"\n"`, and `"\r\n"` are considered line ends.
-- They are not included in the lines unless `keep_ends` is passed.
-- Terminal line end does not produce an extra line.
-- Splitting an empty string results in an empty list.
-- @string s the string.
-- @bool[opt] keep_ends include line ends.
-- @return List of lines
function stringx.splitlines(s, keep_ends)
    assert_string(1, s)
    local lines = {}
    local start = 1
    local eol = find(s, '[\r\n]', start)
    while eol do
        -- a "\r" immediately followed by "\n" counts as one terminator
        local terminator = sub(s, eol, eol + 1)
        if terminator ~= '\r\n' then
            terminator = sub(s, eol, eol)
        end
        -- last character of the emitted line: with or without the terminator
        local stop = keep_ends and (eol + #terminator - 1) or (eol - 1)
        lines[#lines + 1] = sub(s, start, stop)
        start = eol + #terminator
        eol = find(s, '[\r\n]', start)
    end
    -- text after the final terminator (if any) forms the last line
    if start <= #s then
        lines[#lines + 1] = sub(s, start)
    end
    return makelist(lines)
end
--- split a string into a list of strings using a delimiter.
-- With an explicit delimiter the match is plain text, and a delimiter at the
-- very end of `s` produces a trailing empty string (unless `n` results have
-- already been produced). Without a delimiter, leading whitespace is removed
-- first and the split is done by `utils.split` with its default delimiter.
-- @function split
-- @string s the string
-- @string[opt] re a delimiter (defaults to whitespace)
-- @int[opt] n maximum number of results
-- @return List
-- @usage #(stringx.split('one two')) == 2
-- @usage stringx.split('one,two,three', ',') == List{'one','two','three'}
-- @usage stringx.split('one,two,three', ',', 2) == List{'one','two,three'}
function stringx.split(s, re, n)
    assert_string(1, s)
    if not re then
        -- default mode: drop leading whitespace, non-plain matching
        return makelist(usplit(lstrip(s), re, false, n))
    end
    local parts = usplit(s, re, true, n)
    -- `s` ends with the delimiter and there is still room for one more result
    local ends_with_delim = re ~= '' and find(s, re, -#re, true)
    if ends_with_delim and (n or math.huge) > #parts then
        parts[#parts + 1] = ""
    end
    return makelist(parts)
end
--- replace all tabs in s with spaces up to the next tab stop.
-- Tab stops occur every `tabsize` columns; if not specified, tabsize defaults to 8.
-- Columns are counted from the start of each line. A tabsize of 0 removes the tabs.
-- @string s the string
-- @int tabsize[opt=8] distance between tab stops
-- @return expanded string
-- @usage stringx.expandtabs('\tone,two,three', 4)   == '    one,two,three'
-- @usage stringx.expandtabs('  \tone,two,three', 4) == '    one,two,three'
function stringx.expandtabs(s, tabsize)
    assert_string(1, s)
    tabsize = tabsize or 8
    -- `before_tab` is the text between the previous tab/line start and this tab
    local function expand(before_tab)
        if tabsize == 0 then
            return before_tab
        end
        local padding = tabsize - #before_tab % tabsize
        return before_tab .. (" "):rep(padding)
    end
    return (s:gsub("([^\t\r\n]*)\t", expand))
end
--- Finding and Replacing
-- @section find
-- Scan `s` for plain-text occurrences of `sub`, starting at `first` (default 1)
-- and accepting only matches that end at or before `last` (default `#s`).
-- Returns the start index of the last accepted match (nil if none) and the
-- number of accepted matches. With `allow_overlap` the scan resumes one
-- character after each match start instead of after the match end.
-- An empty `sub` short-circuits to `last+1, last-first+1`.
local function _find_all(s, sub, first, last, allow_overlap)
    first = first or 1
    last = last or #s
    if sub == '' then
        return last + 1, last - first + 1
    end
    local last_start
    local count = 0
    local from, to = find(s, sub, first, true)
    while from and to <= last do
        last_start = from
        count = count + 1
        local resume = allow_overlap and (from + 1) or (to + 1)
        from, to = find(s, sub, resume, true)
    end
    return last_start, count
end
--- find index of first instance of sub in s from the left.
-- The search is plain text (no patterns). If `last` is given, the match
-- must end at or before that index to count.
-- @string s the string
-- @string sub substring
-- @int[opt] first first index
-- @int[opt] last last index
-- @return start index, or nil if not found
function stringx.lfind(s, sub, first, last)
    assert_string(1, s)
    assert_string(2, sub)
    local from, to = find(s, sub, first, true)
    if not from then
        return nil
    end
    if last and to > last then
        return nil
    end
    return from
end
--- find index of first instance of sub in s from the right.
-- That is, the start of the last plain-text occurrence of `sub` that begins
-- at or after `first` and ends at or before `last`.
-- @string s the string
-- @string sub substring
-- @int[opt] first first index
-- @int[opt] last last index
-- @return start index, or nil if not found
function stringx.rfind(s, sub, first, last)
    assert_string(1, s)
    assert_string(2, sub)
    -- only the index is wanted; the match count is discarded
    local index = _find_all(s, sub, first, last, true)
    return index
end
--- replace up to n instances of old by new in the string s.
-- If n is not present, replace all instances.
-- Both `old` and `new` are taken literally: `old` is escaped before being
-- used as a pattern and every `%` in `new` is doubled.
-- @string s the string
-- @string old the target substring
-- @string new the substitution
-- @int[opt] n optional maximum number of substitutions
-- @return result string
function stringx.replace(s, old, new, n)
    assert_string(1, s)
    assert_string(2, old)
    assert_string(3, new)
    local pattern = escape(old)
    -- keep only the string result of gsub, not its substitution count
    local replacement = new:gsub('%%', '%%%%')
    return (gsub(s, pattern, replacement, n))
end
--- count all instances of substring in string.
-- The search is plain text (no patterns).
-- @string s the string
-- @string sub substring
-- @bool[opt] allow_overlap allow matches to overlap
-- @return the number of occurrences found
-- @usage
-- assert(stringx.count('banana', 'ana') == 1)
-- assert(stringx.count('banana', 'ana', true) == 2)
function stringx.count(s, sub, allow_overlap)
    assert_string(1, s)
    -- second result of _find_all is the match count; `false` means "no upper limit"
    return (select(2, _find_all(s, sub, 1, false, allow_overlap)))
end
--- Stripping and Justifying
-- @section strip
-- Pad `s` with `ch` (default a space) up to width `w`; strings already at
-- least `w` long are returned unchanged.
--   left and right : padding on both sides, the extra character (if the
--                    amount is odd) goes after the string
--   right only     : padding goes in front of the string
--   otherwise      : padding goes after the string
local function _just(s, w, ch, left, right)
    local missing = w - #s
    if not (missing > 0) then
        return s
    end
    ch = ch or ' '
    if left and right then
        local after = ceil(missing / 2)
        local before = missing - after
        return rep(ch, before) .. s .. rep(ch, after)
    elseif right then
        return rep(ch, missing) .. s
    end
    return s .. rep(ch, missing)
end
--- left-justify s with width w.
-- The string is kept at the left and the padding is appended on the right.
-- Strings that are already `w` or more characters long are returned unchanged.
-- @string s the string
-- @int w width of justification
-- @string[opt=' '] ch padding character
-- @return the padded string
-- @usage stringx.ljust('hello', 10, '*') == 'hello*****'
function stringx.ljust(s, w, ch)
    assert_string(1, s)
    assert_arg(2, w, 'number')
    return _just(s, w, ch, true, false)
end
--- right-justify s with width w.
-- The string is kept at the right and the padding is inserted on the left.
-- Strings that are already `w` or more characters long are returned unchanged.
-- @string s the string
-- @int w width of justification
-- @string[opt=' '] ch padding character
-- @return the padded string
-- @usage stringx.rjust('hello', 10, '*') == '*****hello'
function stringx.rjust(s, w, ch)
    assert_string(1, s)
    assert_arg(2, w, 'number')
    return _just(s, w, ch, false, true)
end
--- center-justify s with width w.
-- When the padding cannot be divided evenly, the extra character goes on the right.
-- Strings that are already `w` or more characters long are returned unchanged.
-- @string s the string
-- @int w width of justification
-- @string[opt=' '] ch padding character
-- @return the padded string
-- @usage stringx.center('hello', 10, '*') == '**hello***'
function stringx.center(s, w, ch)
    assert_string(1, s)
    assert_arg(2, w, 'number')
    return _just(s, w, ch, true, true)
end
-- Remove a run of characters from the left and/or right end of `s`.
-- `chrs` is a string of characters to remove (escaped and wrapped in a
-- character set); without it, whitespace (`%s`) is removed.
local function _strip(s, left, right, chrs)
    local class = chrs and ('[' .. escape(chrs) .. ']') or '%s'
    local first, last = 1, nil
    if left then
        local from, to = find(s, '^' .. class .. '*')
        -- an empty match reports to < from; only a non-empty run moves the start
        if to >= from then
            first = to + 1
        end
    end
    if right then
        -- NOTE(review): strings of 200+ bytes are scanned on a reversed copy with a
        -- '^' anchor instead of a '$' anchor; presumably a performance measure - confirm.
        if #s < 200 then
            local from, to = find(s, class .. '*$', first)
            if to >= from then
                last = from - 1
            end
        else
            local from, to = find(reverse(s), '^' .. class .. '*')
            if to >= from then
                -- length of the trailing run, expressed as a negative end index
                last = -to - 1
            end
        end
    end
    return sub(s, first, last)
end
--- trim any characters on the left of s.
-- @string s the string
-- @string[opt='%s'] chrs default any whitespace character,
-- but can be a string of characters to be trimmed
-- @return the trimmed string
function stringx.lstrip(s, chrs)
    assert_string(1, s)
    return _strip(s, true, false, chrs)
end
-- fill in the forward-declared local used by functions defined earlier in the file
lstrip = stringx.lstrip
--- trim any characters on the right of s.
-- @string s the string
-- @string[opt='%s'] chrs default any whitespace character,
-- but can be a string of characters to be trimmed
-- @return the trimmed string
function stringx.rstrip(s, chrs)
    assert_string(1, s)
    return _strip(s, false, true, chrs)
end
--- trim any characters on both left and right of s.
-- @string s the string
-- @string[opt='%s'] chrs default any whitespace character,
-- but can be a string of characters to be trimmed
-- @return the trimmed string
-- @usage stringx.strip('  --== Hello ==--  ', "- =")  --> 'Hello'
function stringx.strip(s, chrs)
    assert_string(1, s)
    return _strip(s, true, true, chrs)
end
--- Partitioning Strings
-- @section partitioning
--- split a string using a pattern. Note that at least one value will be returned!
-- After checking that `s` is a string, this simply forwards to `utils.splitv`.
-- @string s the string
-- @string[opt='%s'] re a Lua string pattern (defaults to whitespace)
-- @return the parts of the string
-- @usage a,b = line:splitv('=')
-- @see utils.splitv
function stringx.splitv(s, re)
    assert_string(1, s)
    return utils.splitv(s, re)
end
-- The partition functions split a string using a delimiter into three parts:
-- the part before, the delimiter itself, and the part afterwards.
--
-- `fn(p, delim)` is the search function. It returns the start index of the
-- match (nil or -1 when there is none) and, optionally, the end index.
-- When no match is found the result is `p, '', ''`.
local function _partition(p, delim, fn)
    local i1, i2 = fn(p, delim)
    if not i1 or i1 == -1 then
        return p, '', ''
    end
    if not i2 then
        -- The finder gave only the start of the match. The delimiter is matched
        -- as plain text, so the match is exactly `#delim` characters long.
        -- (Using `i1` here would cut a multi-character delimiter after its
        -- first character.)
        i2 = i1 + #delim - 1
    end
    return sub(p, 1, i1 - 1), sub(p, i1, i2), sub(p, i2 + 1)
end
--- partition the string using first occurrence of a delimiter
-- @string s the string
-- @string ch delimiter (match as plain string, no patterns); must not be empty
-- @return part before ch
-- @return ch
-- @return part after ch
-- @usage {stringx.partition('a,b,c', ',')} == {'a', ',', 'b,c'}
-- @usage {stringx.partition('abc', 'x')} == {'abc', '', ''}
function stringx.partition(s, ch)
    assert_string(1, s)
    assert_nonempty_string(2, ch)
    -- search from the left
    return _partition(s, ch, stringx.lfind)
end
--- partition the string s using last occurrence of a delimiter
-- @string s the string
-- @string ch delimiter (match as plain string, no patterns); must not be empty
-- @return part before ch
-- @return ch
-- @return part after ch
-- @usage {stringx.rpartition('a,b,c', ',')} == {'a,b', ',', 'c'}
-- @usage {stringx.rpartition('abc', 'x')} == {'', '', 'abc'}
function stringx.rpartition(s, ch)
    assert_string(1, s)
    assert_nonempty_string(2, ch)
    -- search from the right
    local head, sep, tail = _partition(s, ch, stringx.rfind)
    if head ~= s then
        return head, sep, tail
    end
    -- no match found: the whole string goes in the last slot instead of the first
    return tail, sep, head
end
--- return the 'character' at the index.
-- @string s the string
-- @int idx an index (can be negative)
-- @return a substring of length 1 if successful, empty string otherwise.
function stringx.at(s, idx)
    assert_string(1, s)
    assert_arg(2, idx, 'number')
    local ch = sub(s, idx, idx)
    return ch
end
--- Text handling
-- @section text
--- indent a multiline string.
-- Every line is prefixed with `ch` repeated `n` times; the result always
-- ends with a newline.
-- @tparam string s the (multiline) string
-- @tparam integer n the size of the indent
-- @tparam[opt=' '] string ch the character to use when indenting
-- @return indented string
function stringx.indent (s, n, ch)
    assert_arg(1, s, 'string')
    assert_arg(2, n, 'number')
    local indented = {}
    local prefix = string.rep(ch or ' ', n)
    for i, line in ipairs(usplit(s, '\n')) do
        indented[i] = prefix .. line
    end
    return concat(indented, '\n') .. '\n'
end
--- dedent a multiline string by removing any initial indent.
-- useful when working with [[..]] strings.
-- The indent is the smallest number of leading whitespace characters found
-- on any line that has a non-whitespace character; empty and whitespace-only
-- lines are ignored when measuring it. If no line has any content, there is
-- no indent to measure and the lines are left as they are.
-- The result always ends with a newline.
-- @tparam string s the (multiline) string
-- @return a string with initial indent zero.
-- @usage
-- local s = dedent [[
--          One
--
--        Two
--
--      Three
-- ]]
-- assert(s == [[
--     One
--
--   Two
--
-- Three
-- ]])
function stringx.dedent (s)
    assert_arg(1, s, 'string')
    local lst = usplit(s, '\n')
    -- index of the first non-space character on the least indented line;
    -- stays nil when no line contains anything but whitespace
    local ind_size
    for _, line in ipairs(lst) do
        local _, first_char = find(line, '^%s*[^%s]')
        if first_char and (not ind_size or first_char < ind_size) then
            ind_size = first_char
        end
    end
    -- Only trim when an indent was actually measured. (An infinite placeholder
    -- passed to string.sub raises "number has no integer representation"
    -- on Lua 5.3 and later.)
    if ind_size then
        for i, line in ipairs(lst) do
            lst[i] = sub(line, ind_size)
        end
    end
    return concat(lst, '\n') .. '\n'
end
do
    -- Remove words from the front of `words` and assemble the next output line.
    -- A word longer than `size` gets a line of its own: whole when `breaklong`
    -- is falsy, otherwise only its first `size` characters (the rest stays at
    -- the front of `words`). Expects `words` to hold at least one word and no
    -- empty strings. Returns the line and the (mutated) `words` table.
    local function buildline(words, size, breaklong)
        local line = {}
        if #words[1] > size then
            -- word longer than line
            if breaklong then
                line[1] = words[1]:sub(1, size)
                words[1] = words[1]:sub(size + 1, -1)
            else
                line[1] = remove(words, 1)
            end
        else
            -- `len` is the width used so far, including one separating space
            -- after each word already placed
            local len = 0
            while words[1] and len + #words[1] <= size do
                line[#line + 1] = words[1]
                len = len + #words[1] + 1
                remove(words, 1)
            end
        end
        return concat(line, " "), words
    end

    --- format a paragraph into lines so that they fit into a line width.
    -- It will not break long words by default, so lines can be over the length
    -- to that extent. Line breaks in `s` are treated as spaces and runs of
    -- whitespace are collapsed. An empty (or all-whitespace) string gives a
    -- list holding a single empty line.
    -- @tparam string s the string to format
    -- @tparam[opt=70] integer width the margin width
    -- @tparam[opt=false] boolean breaklong if truthy, words longer than the width given will be forced split
    -- (the width must then be at least 1, otherwise an error is raised).
    -- @return a list of lines (List object), use `fill` to return a string instead of a `List`.
    -- @see pl.List
    -- @see fill
    stringx.wrap = function(s, width, breaklong)
        assert_string(1, s)
        s = s:gsub('\n', ' ')   -- remove line breaks
        s = stringx.strip(s)    -- remove leading/trailing whitespace
        if s == "" then
            -- same List type as every other return path
            return makelist({ "" })
        end
        width = width or 70
        if breaklong and width < 1 then
            -- chopping a word into zero-length pieces would never finish
            error("width must be at least 1 when breaklong is set", 2)
        end

        local out = {}
        -- split on runs of whitespace so that no empty words are produced;
        -- an empty word in front of an over-long word used to yield an empty line
        local words = usplit(s, "%s+")
        while words[1] do
            local line
            line, words = buildline(words, width, breaklong)
            out[#out + 1] = line
        end
        return makelist(out)
    end
end
--- format a paragraph so that it fits into a line width.
-- The lines produced by `wrap` are joined with newlines, and a final newline
-- is appended.
-- @tparam string s the string to format
-- @tparam[opt=70] integer width the margin width
-- @tparam[opt=false] boolean breaklong if truthy, words longer than the width given will be forced split.
-- @return a string, use `wrap` to return a list of lines instead of a string.
-- @see wrap
function stringx.fill (s, width, breaklong)
    local wrapped = stringx.wrap(s, width, breaklong)
    return concat(wrapped, '\n') .. '\n'
end
--- Template
-- @section Template
-- Replace `${name}` and then `$name` placeholders in `s`.
-- `tbl` is either a callable, which is handed to gsub directly, or a table
-- that is indexed by the placeholder name. For a table, a name whose value is
-- nil or false raises an error, unless `safe` is truthy, in which case the
-- bare name is returned as the replacement.
local function _substitute(s, tbl, safe)
    local lookup = tbl
    if not is_callable(tbl) then
        lookup = function(name)
            local value = tbl[name]
            if value then
                return value
            end
            if safe then
                return name
            end
            error("not present in table "..name)
        end
    end
    -- braced placeholders first, then the plain form on the result
    local braced_done = gsub(s, '%${([%w_]+)}', lookup)
    return (gsub(braced_done, '%$([%w_]+)', lookup))
end
-- The Template class table; instances look their methods up in it.
local Template = {}
stringx.Template = Template
Template.__index = Template

-- Calling the class itself, `Template(tmpl)`, forwards to `Template.new(tmpl)`.
setmetatable(Template, {
    __call = function(_, tmpl)
        return Template.new(tmpl)
    end
})
--- Creates a new Template object.
-- Calling `Template(tmpl)` is a shortcut to `Template.new(tmpl)`.
-- The template text is stored in the `tmpl` field of the new object.
-- @tparam string tmpl the template string
-- @function Template
-- @treturn Template
function Template.new(tmpl)
    assert_arg(1, tmpl, 'string')
    return setmetatable({ tmpl = tmpl }, Template)
end
--- substitute values into a template, throwing an error.
-- Both `$name` and `${name}` placeholders are replaced.
-- This will throw an error if no name is found.
-- @tparam table tbl a table of name-value pairs.
-- @return string with place holders substituted
function Template:substitute(tbl)
    assert_arg(1, tbl, 'table')
    local strict = false
    return _substitute(self.tmpl, tbl, strict)
end
--- substitute values into a template.
-- Both `$name` and `${name}` placeholders are replaced.
-- This version does not throw on unknown names: such a placeholder is
-- replaced by the bare name.
-- @tparam table tbl a table of name-value pairs.
-- @return string with place holders substituted
function Template:safe_substitute(tbl)
    assert_arg(1, tbl, 'table')
    local safe = true
    return _substitute(self.tmpl, tbl, safe)
end
--- substitute values into a template, preserving indentation. <br>
-- If the value is a multiline string _or_ a template, it will insert
-- the lines at the correct indentation. <br>
-- Furthermore, if a template, then that template will be substituted
-- using the same table. <br>
-- Only `$name` placeholders are handled here. A name whose value is nil or
-- false raises an error. The result always ends with a newline.
-- @tparam table tbl a table of name-value pairs.
-- @return string with place holders substituted
function Template:indent_substitute(tbl)
    assert_arg(1, tbl, 'table')
    -- the template split into lines is cached on the object
    if not self.strings then
        self.strings = usplit(self.tmpl, '\n')
    end
    -- the idea is to substitute line by line, grabbing any spaces as
    -- well as the $var. If the value to be substituted contains newlines,
    -- then we split that into lines and adjust the indent before inserting.
    local function expand(sp, name)
        local value = tbl[name]
        if not value then error("not present in table "..name) end
        local subtmpl = getmetatable(value) == Template
        local text
        if subtmpl then
            text = value.tmpl
        else
            text = tostring(value)
        end
        if text:find '\n' then
            local parts = usplit(text, '\n')
            for i, part in ipairs(parts) do
                parts[i] = sp .. part
            end
            text = concat(parts, '\n') .. '\n'
        else
            -- The pattern consumed the whitespace in front of the placeholder,
            -- so it has to be put back for a single-line value as well;
            -- otherwise the indentation (or separating space) would be lost.
            text = sp .. text
        end
        if subtmpl then
            -- nested template: expand its own placeholders with the same table
            return _substitute(text, tbl)
        end
        return text
    end
    local lines = {}
    for i, line in ipairs(self.strings) do
        -- keep only the string result of gsub, not its substitution count
        lines[i] = line:gsub('(%s*)%$([%w_]+)', expand)
    end
    return concat(lines, '\n') .. '\n'
end
--- Miscellaneous
-- @section misc
--- return an iterator over all lines in a string
-- Lines are separated by `"\n"`; the separator is not part of the yielded line.
-- A string that does not end in a newline is treated as if it did.
-- @string s the string
-- @return an iterator
-- @usage
-- local line_no = 1
-- for line in stringx.lines(some_text) do
--   print(line_no, line)
--   line_no = line_no + 1
-- end
function stringx.lines(s)
    assert_string(1, s)
    local text = s
    if not text:find('\n$') then
        -- make sure the last line is terminated so the pattern below sees it
        text = text .. '\n'
    end
    return text:gmatch('([^\n]*)\n')
end
--- initial word letters uppercase ('title case').
-- Here 'words' mean chunks of non-space characters.
-- The remainder of each word is converted to lower case.
-- @string s the string
-- @return a string with each word's first letter uppercase
-- @usage stringx.title("hello world") == "Hello World"
function stringx.title(s)
    assert_string(1, s)
    local function fix_case(first, rest)
        return first:upper() .. rest:lower()
    end
    return (s:gsub('(%S)(%S*)', fix_case))
end
-- `capitalize` is an alias for `title`
stringx.capitalize = stringx.title
do
    local ellipsis = '...'
    local n_ellipsis = #ellipsis

    --- Return a shortened version of a string.
    -- Fits string within w characters. Removed characters are marked with ellipsis.
    -- If w is smaller than the ellipsis itself, the first w characters of the
    -- ellipsis are returned.
    -- @string s the string
    -- @int w the maximum size allowed
    -- @bool tail true if we want to show the end of the string (head otherwise)
    -- @return the string itself if it fits, otherwise the shortened string
    -- @usage ('1234567890'):shorten(8) == '12345...'
    -- @usage ('1234567890'):shorten(8, true) == '...67890'
    -- @usage ('1234567890'):shorten(20) == '1234567890'
    function stringx.shorten(s, w, tail)
        assert_string(1, s)
        if not (#s > w) then
            return s
        end
        if w < n_ellipsis then
            return ellipsis:sub(1, w)
        end
        if tail then
            -- keep the last (w - n_ellipsis) characters
            local from = #s - w + 1 + n_ellipsis
            return ellipsis .. s:sub(from)
        end
        -- keep the first (w - n_ellipsis) characters
        return s:sub(1, w - n_ellipsis) .. ellipsis
    end
end
do
    -- Utility function that finds any sequences that look like the opening or
    -- closing of a long string: two identical square brackets with zero or
    -- more equal signs between them.
    -- Note that using the least possible number of equal signs would need a
    -- harder algorithm; this one simply reports the greatest number found.
    -- @param s The string
    -- @return 'nil' if not found. If found, the maximum number of equal signs found within all matches.
    local function has_lquote(s)
        local lstring_pat = '([%[%]])(=*)%1'
        local longest
        local pos = 1
        while true do
            local _, stop, _, equals = s:find(lstring_pat, pos)
            if not equals then
                return longest
            end
            longest = max(longest or 0, #equals)
            -- resume on the last bracket of this match: it may open the next one
            pos = stop
        end
    end

    --- Quote the given string and preserve any control or escape characters, such that reloading the string in Lua returns the same result.
    -- Long brackets are used when the string contains a newline or a
    -- long-bracket-like sequence, and no carriage return; otherwise the
    -- string is quoted with the `%q` format.
    -- @param s The string to be quoted.
    -- @return The quoted string.
    function stringx.quote_string(s)
        assert_string(1, s)
        -- Find out if there are any embedded long-quote sequences that may cause issues.
        -- This is important when strings are embedded within strings, like when serializing.
        -- Append a closing bracket to catch unfinished long-quote sequences at the end of the string.
        local level = has_lquote(s .. "]")

        -- Note that strings containing "\r" can't be quoted using long brackets
        -- as Lua lexer converts all newlines to "\n" within long strings.
        local use_long_brackets = (s:find("\n") or level) and not s:find("\r")
        if not use_long_brackets then
            -- Escape funny stuff. Lua 5.1 does not handle "\r" correctly.
            return (("%q"):format(s):gsub("\r", "\\r"))
        end

        -- If there is an embedded sequence that matches a long quote, then
        -- use one more = sign than the longest such sequence.
        local equal_signs = ("="):rep((level or -1) + 1)
        -- Long strings strip out leading newline. We want to retain that, when quoting.
        if s:find("^\n") then
            s = "\n" .. s
        end
        return "[" .. equal_signs .. "[" .. s .. "]" .. equal_signs .. "]"
    end
end
--- Python-style formatting operator.
-- Calling `stringx.format_operator()` overloads the % operator for strings to give
-- Python/Ruby style formatted output.
-- This is extended to also do template-like substitution for map-like data.
--
-- Note this goes further than the original, and will allow these cases:
--
-- 1. a single value
-- 2. a list of values
-- 3. a map of var=value pairs
-- 4. a function, as in gsub
--
-- For the second two cases, it uses $-variable substitution.
--
-- In the first two cases any value given for a `%s` conversion is passed
-- through `tostring` first. A `nil` right-hand side returns the string unchanged.
--
-- When called, this function will monkey-patch the global `string` metatable by
-- adding a `__mod` method.
--
-- See <a href="http://lua-users.org/wiki/StringInterpolation">the lua-users wiki</a>
--
-- @usage
-- require 'pl.stringx'.format_operator()
-- local out1 = '%s = %5.3f' % {'PI',math.pi}                   --> 'PI = 3.142'
-- local out2 = '$name = $value' % {name='dog',value='Pluto'}   --> 'dog = Pluto'
function stringx.format_operator()

    local format = string.format

    -- a more forgiving version of string.format, which applies
    -- tostring() to any value with a %s format.
    local function formatx (fmt, ...)
        local args = pack(...)
        local i = 1
        -- Walk over complete conversion specifications (optional flags, width
        -- and precision followed by the conversion character) so that
        --   * a literal "%%" is not counted as consuming an argument, and
        --   * "%s" with flags or a width (e.g. "%-10s") is still recognised.
        for conv in fmt:gmatch('%%[%-%+ #0]*%d*%.?%d*(.)') do
            if conv ~= '%' then
                if conv == 's' and type(args[i]) ~= 'string' then
                    args[i] = tostring(args[i])
                end
                i = i + 1
            end
        end
        return format(fmt, unpack(args))
    end

    -- $name substitution where `t` is anything gsub accepts as a replacement
    local function basic_subst(s, t)
        return (s:gsub('%$([%w_]+)', t))
    end

    getmetatable("").__mod = function(a, b)
        if b == nil then
            return a
        elseif type(b) == "table" and getmetatable(b) == nil then
            if #b == 0 then -- assume a map-like table
                return _substitute(a, b, true)
            else
                return formatx(a, unpack(b))
            end
        elseif type(b) == 'function' then
            return basic_subst(a, b)
        else
            return formatx(a, b)
        end
    end
end
--- import the stringx functions into the global string (meta)table
-- This hands the module table and the `string` table to `utils.import`.
function stringx.import()
    local target = string
    utils.import(stringx, target)
end

return stringx