mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-04-28 13:28:01 +03:00
Wrote a simplified regular expression parser to make life easier (it only does single matches; i.e., no one-or-more matches, etc.).
Fixed some of the whitespace/line break matching.
This commit is contained in:
parent
873dbc2421
commit
4e435b1321
7 changed files with 277 additions and 75 deletions
17
main.cpp
17
main.cpp
|
@ -1,7 +1,24 @@
|
|||
#include "document.h"
|
||||
#include "regex.h"
|
||||
|
||||
int main()
|
||||
{
|
||||
YAML::RegEx alpha = YAML::RegEx('a', 'z') || YAML::RegEx('A', 'Z');
|
||||
alpha.Matches("a");
|
||||
alpha.Matches("d");
|
||||
alpha.Matches("F");
|
||||
alpha.Matches("0");
|
||||
alpha.Matches("5");
|
||||
alpha.Matches(" ");
|
||||
|
||||
YAML::RegEx blank = YAML::RegEx(' ') || YAML::RegEx('\t');
|
||||
YAML::RegEx docstart = YAML::RegEx("---") + (blank || YAML::RegEx(EOF) || YAML::RegEx());
|
||||
docstart.Matches("--- ");
|
||||
docstart.Matches("... ");
|
||||
docstart.Matches("----");
|
||||
docstart.Matches("---\t");
|
||||
docstart.Matches("---");
|
||||
|
||||
YAML::Document doc("test.yaml");
|
||||
|
||||
return 0;
|
||||
|
|
114
regex.cpp
Normal file
114
regex.cpp
Normal file
|
@ -0,0 +1,114 @@
|
|||
#include "regex.h"
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
// Private constructor: creates a bare node of the given kind. The operator
// helpers (!, ||, +) use it and then append their operands to m_params.
// m_a and m_z carry no meaning for such nodes, but they are zeroed so that
// copying the node (e.g. into a std::vector) never reads an indeterminate value.
RegEx::RegEx(REGEX_OP op): m_op(op), m_a(0), m_z(0)
{
}
|
||||
|
||||
// Default constructor: creates the empty expression (REGEX_EMPTY).
// m_a and m_z are unused by this node kind; they are zeroed so that copies
// of the object never read an indeterminate value.
RegEx::RegEx(): m_op(REGEX_EMPTY), m_a(0), m_z(0)
{
}
|
||||
|
||||
// Single-character constructor: creates a node that matches exactly 'ch'.
// Only m_a is consulted for REGEX_MATCH nodes; m_z is zeroed so that copies
// of the object never read an indeterminate value.
RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch), m_z(0)
{
}
|
||||
|
||||
// Range constructor: creates a node that matches any single character c
// with a <= c <= z (both bounds inclusive).
RegEx::RegEx(char a, char z)
	: m_op(REGEX_RANGE)
	, m_a(a)
	, m_z(z)
{
}
|
||||
|
||||
// String constructor: creates a composite node of kind 'op' with one
// single-character operand per character of 'str', in order.
// . With REGEX_SEQ the characters must appear one after another.
// . With REGEX_OR any one of the characters matches.
RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op), m_a(0), m_z(0)
{
	m_params.reserve(str.size());

	// every character becomes its own operand (the index must advance with
	// the loop; using str[0] here would repeat the first character only)
	for(std::string::size_type i=0;i<str.size();i++)
		m_params.push_back(RegEx(str[i]));
}
|
||||
|
||||
// Destructor: intentionally empty, the members release themselves.
RegEx::~RegEx() {}
|
||||
|
||||
bool RegEx::Matches(char ch) const
|
||||
{
|
||||
std::string str;
|
||||
str += ch;
|
||||
return Matches(str);
|
||||
}
|
||||
|
||||
bool RegEx::Matches(const std::string& str) const
|
||||
{
|
||||
return Match(str) >= 0;
|
||||
}
|
||||
|
||||
// Match
|
||||
// . Matches the given string against this regular expression.
|
||||
// . Returns the number of characters matched.
|
||||
// . Returns -1 if no characters were matched (the reason for
|
||||
// not returning zero is that we may have an empty regex
|
||||
// which SHOULD be considered successfully matching nothing,
|
||||
// but that of course matches zero characters).
|
||||
int RegEx::Match(const std::string& str) const
|
||||
{
|
||||
switch(m_op) {
|
||||
case REGEX_EMPTY:
|
||||
if(str.empty())
|
||||
return 0;
|
||||
return -1;
|
||||
case REGEX_MATCH:
|
||||
if(str.empty() || str[0] != m_a)
|
||||
return -1;
|
||||
return 1;
|
||||
case REGEX_RANGE:
|
||||
if(str.empty() || m_a > str[0] || m_z < str[0])
|
||||
return -1;
|
||||
return 1;
|
||||
case REGEX_NOT:
|
||||
if(m_params.empty())
|
||||
return false;
|
||||
if(m_params[0].Match(str) >= 0)
|
||||
return -1;
|
||||
return 1;
|
||||
case REGEX_OR:
|
||||
for(unsigned i=0;i<m_params.size();i++) {
|
||||
int n = m_params[i].Match(str);
|
||||
if(n >= 0)
|
||||
return n;
|
||||
}
|
||||
return -1;
|
||||
case REGEX_SEQ:
|
||||
int offset = 0;
|
||||
for(unsigned i=0;i<m_params.size();i++) {
|
||||
int n = m_params[i].Match(str.substr(offset));
|
||||
if(n == -1)
|
||||
return -1;
|
||||
offset += n;
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
// operator !
// . Builds a REGEX_NOT node whose single operand is a copy of 'ex'.
RegEx operator ! (const RegEx& ex)
{
	RegEx negated(REGEX_NOT);
	negated.m_params.assign(1, ex);
	return negated;
}
|
||||
|
||||
// operator ||
// . Builds a REGEX_OR node with copies of 'ex1' and 'ex2' as its
//   alternatives, in that order.
RegEx operator || (const RegEx& ex1, const RegEx& ex2)
{
	RegEx alternatives(REGEX_OR);
	const RegEx *operands[] = { &ex1, &ex2 };
	for(unsigned k=0;k<2;k++)
		alternatives.m_params.push_back(*operands[k]);
	return alternatives;
}
|
||||
|
||||
// operator +
// . Builds a REGEX_SEQ node: a copy of 'ex1' followed by a copy of 'ex2'.
RegEx operator + (const RegEx& ex1, const RegEx& ex2)
{
	RegEx sequence(REGEX_SEQ);
	sequence.m_params.insert(sequence.m_params.end(), ex1);
	sequence.m_params.insert(sequence.m_params.end(), ex2);
	return sequence;
}
|
||||
}
|
37
regex.h
Normal file
37
regex.h
Normal file
|
@ -0,0 +1,37 @@
|
|||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
namespace YAML
{
	// the kind of node a RegEx object represents
	enum REGEX_OP {
		REGEX_EMPTY,	// the empty expression
		REGEX_MATCH,	// one specific character
		REGEX_RANGE,	// one character out of a range
		REGEX_OR,		// any one of the operands
		REGEX_NOT,		// negation of the single operand
		REGEX_SEQ		// the operands one after another
	};

	// simplified regular expressions
	// . Only straightforward matches (no repeated characters)
	// . Only matches from start of string
	class RegEx {
	public:
		// construction / destruction
		RegEx();
		RegEx(char ch);
		RegEx(char a, char z);
		RegEx(const std::string& str, REGEX_OP op = REGEX_SEQ);
		~RegEx();

		// yes/no queries
		bool Matches(char ch) const;
		bool Matches(const std::string& str) const;

		// number of characters matched, or -1 if there is no match
		int Match(const std::string& str) const;

		// combinators: negation, alternation and sequencing
		friend RegEx operator ! (const RegEx& ex);
		friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
		friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);

	private:
		// creates a bare node of the given kind (for the combinators above)
		RegEx(REGEX_OP op);

		REGEX_OP m_op;					// kind of this node
		char m_a, m_z;					// character to match, or lower/upper bound of a range
		std::vector <RegEx> m_params;	// operands of an OR / NOT / SEQ node
	};
}
|
133
scanner.cpp
133
scanner.cpp
|
@ -30,7 +30,10 @@ namespace YAML
|
|||
char Scanner::GetChar()
|
||||
{
|
||||
m_column++;
|
||||
return INPUT.get();
|
||||
char ch = INPUT.get();
|
||||
if(ch == '\n')
|
||||
m_column = 0;
|
||||
return ch;
|
||||
}
|
||||
|
||||
// Eat
|
||||
|
@ -87,18 +90,6 @@ namespace YAML
|
|||
return false;
|
||||
}
|
||||
|
||||
// IsLineBreak
|
||||
bool Scanner::IsLineBreak(char ch)
|
||||
{
|
||||
return ch == '\n'; // TODO: More types of line breaks
|
||||
}
|
||||
|
||||
// IsBlank
|
||||
bool Scanner::IsBlank(char ch)
|
||||
{
|
||||
return IsLineBreak(ch) || ch == ' ' || ch == '\t' || ch == EOF;
|
||||
}
|
||||
|
||||
// IsDocumentStart
|
||||
bool Scanner::IsDocumentStart()
|
||||
{
|
||||
|
@ -106,8 +97,7 @@ namespace YAML
|
|||
if(m_column != 0)
|
||||
return false;
|
||||
|
||||
std::string next = Peek(4);
|
||||
return next[0] == '-' && next[1] == '-' && next[2] == '-' && IsBlank(next[3]);
|
||||
return Exp::DocStart.Matches(Peek(4));
|
||||
}
|
||||
|
||||
// IsDocumentEnd
|
||||
|
@ -117,61 +107,41 @@ namespace YAML
|
|||
if(m_column != 0)
|
||||
return false;
|
||||
|
||||
std::string next = Peek(4);
|
||||
return next[0] == '.' && next[1] == '.' && next[2] == '.' && IsBlank(next[3]);
|
||||
return Exp::DocEnd.Matches(Peek(4));
|
||||
}
|
||||
|
||||
// IsBlockEntry
|
||||
bool Scanner::IsBlockEntry()
|
||||
{
|
||||
std::string next = Peek(2);
|
||||
return next[0] == Keys::BlockEntry && IsBlank(next[1]);
|
||||
return Exp::BlockEntry.Matches(Peek(2));
|
||||
}
|
||||
|
||||
// IsKey
|
||||
bool Scanner::IsKey()
|
||||
{
|
||||
std::string next = Peek(2);
|
||||
return next[0] == Keys::Key && (IsBlank(next[1]) || m_flowLevel > 0);
|
||||
if(m_flowLevel > 0)
|
||||
return Exp::KeyInFlow.Matches(next);
|
||||
return Exp::Key.Matches(next);
|
||||
}
|
||||
|
||||
// IsValue
|
||||
bool Scanner::IsValue()
|
||||
{
|
||||
std::string next = Peek(2);
|
||||
return next[0] == Keys::Value && (IsBlank(next[1]) || m_flowLevel > 0);
|
||||
if(m_flowLevel > 0)
|
||||
return Exp::ValueInFlow.Matches(next);
|
||||
return Exp::Value.Matches(next);
|
||||
}
|
||||
|
||||
// IsPlainScalar
|
||||
// . Rules:
|
||||
// . Cannot start with a blank.
|
||||
// . Can never start with any of , [ ] { } # & * ! | > \' \" % @ `
|
||||
// . In the block context - ? : must not be followed by a space.
|
||||
// . In the flow context ? : are illegal and - must not be followed with a space.
|
||||
bool Scanner::IsPlainScalar()
|
||||
{
|
||||
std::string next = Peek(2);
|
||||
|
||||
if(IsBlank(next[0]))
|
||||
return false;
|
||||
|
||||
// never characters
|
||||
if(std::string(",[]{}#&*!|>\'\"%@`").find(next[0]) != std::string::npos)
|
||||
return false;
|
||||
|
||||
// specific block/flow characters
|
||||
if(m_flowLevel == 0) {
|
||||
if((next[0] == '-' || next[0] == '?' || next[0] == ':') && IsBlank(next[1]))
|
||||
return false;
|
||||
} else {
|
||||
if(next[0] == '?' || next[0] == ':')
|
||||
return false;
|
||||
|
||||
if(next[0] == '-' && IsBlank(next[1]))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
if(m_flowLevel > 0)
|
||||
return Exp::PlainScalarInFlow.Matches(next);
|
||||
return Exp::PlainScalar.Matches(next);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
@ -233,7 +203,7 @@ namespace YAML
|
|||
// DocumentEndToken
|
||||
template <> DocumentEndToken *Scanner::ScanToken(DocumentEndToken *pToken)
|
||||
{
|
||||
PopIndentTo(m_column);
|
||||
PopIndentTo(-1);
|
||||
// TODO: "reset simple keys"
|
||||
|
||||
m_simpleKeyAllowed = false;
|
||||
|
@ -389,8 +359,8 @@ namespace YAML
|
|||
m_simpleKeyAllowed = false;
|
||||
|
||||
// now eat and store the scalar
|
||||
std::string scalar;
|
||||
bool leadingBlanks = true;
|
||||
std::string scalar, whitespace, leadingBreaks, trailingBreaks;
|
||||
bool leadingBlanks = false;
|
||||
|
||||
while(INPUT) {
|
||||
// doc start/end tokens
|
||||
|
@ -398,43 +368,72 @@ namespace YAML
|
|||
break;
|
||||
|
||||
// comment
|
||||
if(INPUT.peek() == Keys::Comment)
|
||||
if(Exp::Comment.Matches(INPUT.peek()))
|
||||
break;
|
||||
|
||||
// first eat non-blanks
|
||||
while(INPUT && !IsBlank(INPUT.peek())) {
|
||||
while(INPUT && !Exp::BlankOrBreak.Matches(INPUT.peek())) {
|
||||
std::string next = Peek(2);
|
||||
|
||||
// illegal colon in flow context
|
||||
if(m_flowLevel > 0 && next[0] == ':') {
|
||||
if(!IsBlank(next[1]))
|
||||
throw IllegalScalar();
|
||||
}
|
||||
if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(next))
|
||||
throw IllegalScalar();
|
||||
|
||||
// characters that might end the scalar
|
||||
if(next[0] == ':' && IsBlank(next[1]))
|
||||
if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(next))
|
||||
break;
|
||||
if(m_flowLevel > 0 && std::string(",:?[]{}").find(next[0]) != std::string::npos)
|
||||
if(m_flowLevel == 0 && Exp::EndScalar.Matches(next))
|
||||
break;
|
||||
|
||||
if(leadingBlanks) {
|
||||
if(!leadingBreaks.empty() && leadingBreaks[0] == '\n') {
|
||||
// fold line break?
|
||||
if(trailingBreaks.empty())
|
||||
scalar += ' ';
|
||||
else {
|
||||
scalar += trailingBreaks;
|
||||
trailingBreaks = "";
|
||||
}
|
||||
} else {
|
||||
scalar += leadingBreaks + trailingBreaks;
|
||||
leadingBreaks = "";
|
||||
trailingBreaks = "";
|
||||
}
|
||||
} else if(!whitespace.empty()) {
|
||||
scalar += whitespace;
|
||||
whitespace = "";
|
||||
}
|
||||
|
||||
// finally, read the character!
|
||||
scalar += GetChar();
|
||||
}
|
||||
|
||||
// did we hit a non-blank character that ended us?
|
||||
if(!Exp::BlankOrBreak.Matches(INPUT.peek()))
|
||||
break;
|
||||
|
||||
// now eat blanks
|
||||
while(INPUT && (IsBlank(INPUT.peek()) /* || IsBreak(INPUT.peek()) */)) {
|
||||
if(IsBlank(INPUT.peek())) {
|
||||
while(INPUT && Exp::BlankOrBreak.Matches(INPUT.peek())) {
|
||||
if(Exp::Blank.Matches(INPUT.peek())) {
|
||||
if(leadingBlanks && m_column <= m_indents.top())
|
||||
throw IllegalTabInScalar();
|
||||
|
||||
// TODO: Store some blanks?
|
||||
Eat(1);
|
||||
// maybe store this character
|
||||
if(!leadingBlanks)
|
||||
whitespace += GetChar();
|
||||
else
|
||||
Eat(1);
|
||||
} else {
|
||||
Eat(1);
|
||||
// where to store this character?
|
||||
if(!leadingBlanks) {
|
||||
leadingBlanks = true;
|
||||
whitespace = "";
|
||||
leadingBreaks += GetChar();
|
||||
} else
|
||||
trailingBreaks += GetChar();
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: join whitespace
|
||||
|
||||
// and finally break if we're below the indentation level
|
||||
if(m_flowLevel == 0 && m_column <= m_indents.top())
|
||||
break;
|
||||
|
@ -532,14 +531,14 @@ namespace YAML
|
|||
Eat(1);
|
||||
|
||||
// then eat a comment
|
||||
if(INPUT.peek() == Keys::Comment) {
|
||||
if(Exp::Comment.Matches(INPUT.peek())) {
|
||||
// eat until line break
|
||||
while(INPUT && !IsLineBreak(INPUT.peek()))
|
||||
while(INPUT && !Exp::Break.Matches(INPUT.peek()))
|
||||
Eat(1);
|
||||
}
|
||||
|
||||
// if it's NOT a line break, then we're done!
|
||||
if(!IsLineBreak(INPUT.peek()))
|
||||
if(!Exp::Break.Matches(INPUT.peek()))
|
||||
break;
|
||||
|
||||
// otherwise, let's eat the line break and keep going
|
||||
|
|
37
scanner.h
37
scanner.h
|
@ -5,22 +5,49 @@
|
|||
#include <queue>
|
||||
#include <stack>
|
||||
#include <set>
|
||||
#include "regex.h"
|
||||
|
||||
namespace YAML
|
||||
{
|
||||
class Token;
|
||||
|
||||
namespace Exp
|
||||
{
|
||||
// misc
|
||||
const RegEx Blank = RegEx(' ') || RegEx('\t');
|
||||
const RegEx Break = RegEx('\n');
|
||||
const RegEx BlankOrBreak = Blank || Break;
|
||||
|
||||
// actual tags
|
||||
|
||||
const RegEx DocStart = RegEx("---") + (BlankOrBreak || RegEx(EOF) || RegEx());
|
||||
const RegEx DocEnd = RegEx("...") + (BlankOrBreak || RegEx(EOF) || RegEx());
|
||||
const RegEx BlockEntry = RegEx('-') + (BlankOrBreak || RegEx(EOF));
|
||||
const RegEx Key = RegEx('?'),
|
||||
KeyInFlow = RegEx('?') + BlankOrBreak;
|
||||
const RegEx Value = RegEx(':'),
|
||||
ValueInFlow = RegEx(':') + BlankOrBreak;
|
||||
const RegEx Comment = RegEx('#');
|
||||
|
||||
// Plain scalar rules:
|
||||
// . Cannot start with a blank.
|
||||
// . Can never start with any of , [ ] { } # & * ! | > \' \" % @ `
|
||||
// . In the block context - ? : must be not be followed with a space.
|
||||
// . In the flow context ? : are illegal and - must not be followed with a space.
|
||||
const RegEx PlainScalar = !(BlankOrBreak || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:") + Blank)),
|
||||
PlainScalarInFlow = !(BlankOrBreak || RegEx("?:,[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx('-') + Blank));
|
||||
const RegEx IllegalColonInScalar = RegEx(':') + !BlankOrBreak;
|
||||
const RegEx EndScalar = RegEx(':') + BlankOrBreak,
|
||||
EndScalarInFlow = (RegEx(':') + BlankOrBreak) || RegEx(",:?[]{}");
|
||||
}
|
||||
|
||||
namespace Keys
|
||||
{
|
||||
const char Comment = '#';
|
||||
const char FlowSeqStart = '[';
|
||||
const char FlowSeqEnd = ']';
|
||||
const char FlowMapStart = '{';
|
||||
const char FlowMapEnd = '}';
|
||||
const char FlowEntry = ',';
|
||||
const char BlockEntry = '-';
|
||||
const char Key = '?';
|
||||
const char Value = ':';
|
||||
const char Alias = '*';
|
||||
const char Anchor = '&';
|
||||
const char Tag = '!';
|
||||
|
@ -49,8 +76,6 @@ namespace YAML
|
|||
void EatLineBreak();
|
||||
|
||||
bool IsWhitespaceToBeEaten(char ch);
|
||||
bool IsLineBreak(char ch);
|
||||
bool IsBlank(char ch);
|
||||
bool IsDocumentStart();
|
||||
bool IsDocumentEnd();
|
||||
bool IsBlockEntry();
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
---
|
||||
- milk
|
||||
- eggs
|
||||
- cheese and bread # this is really important!
|
||||
- eggs # this is really important!
|
||||
- cheese and bread
|
||||
...
|
|
@ -185,6 +185,10 @@
|
|||
RelativePath=".\parser.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\regex.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\scalar.cpp"
|
||||
>
|
||||
|
@ -227,6 +231,10 @@
|
|||
RelativePath=".\parser.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\regex.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\scalar.h"
|
||||
>
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue