Wrote a simplified regular expression parser to make life easier (it only does single matches; i.e., no one-or-more matches, etc.).

Fixed some of the whitespace/line break matching.
This commit is contained in:
Jesse Beder 2008-06-27 08:20:41 +00:00
parent 873dbc2421
commit 4e435b1321
7 changed files with 277 additions and 75 deletions

View file

@ -1,7 +1,24 @@
#include "document.h"
#include "regex.h"
int main()
{
YAML::RegEx alpha = YAML::RegEx('a', 'z') || YAML::RegEx('A', 'Z');
alpha.Matches("a");
alpha.Matches("d");
alpha.Matches("F");
alpha.Matches("0");
alpha.Matches("5");
alpha.Matches(" ");
YAML::RegEx blank = YAML::RegEx(' ') || YAML::RegEx('\t');
YAML::RegEx docstart = YAML::RegEx("---") + (blank || YAML::RegEx(EOF) || YAML::RegEx());
docstart.Matches("--- ");
docstart.Matches("... ");
docstart.Matches("----");
docstart.Matches("---\t");
docstart.Matches("---");
YAML::Document doc("test.yaml");
return 0;

114
regex.cpp Normal file
View file

@ -0,0 +1,114 @@
#include "regex.h"
namespace YAML
{
// Private constructor: creates an expression carrying only an operator.
// The friend operators (!, ||, +) use it and then append their operands
// to m_params.
// The character members are zeroed so that copying the object (e.g. via
// push_back) never reads an indeterminate value.
RegEx::RegEx(REGEX_OP op): m_op(op), m_a(0), m_z(0)
{
}
// Default constructor: the empty expression (REGEX_EMPTY).
// The character members are unused by this operation but are zeroed so
// that copies of the object never read an indeterminate value.
RegEx::RegEx(): m_op(REGEX_EMPTY), m_a(0), m_z(0)
{
}
// Single-character expression: matches exactly the character ch.
// m_z is unused by REGEX_MATCH but is zeroed so that copies of the
// object never read an indeterminate value.
RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch), m_z(0)
{
}
// Range expression: stores the lower bound a and the upper bound z
// for a REGEX_RANGE match.
RegEx::RegEx(char a, char z)
	: m_op(REGEX_RANGE)
	, m_a(a)
	, m_z(z)
{
}
// Compound expression built from a string.
// . Every character of str becomes its own single-character
//   sub-expression, and the sub-expressions are combined with op
//   (e.g. REGEX_SEQ to match the literal string, REGEX_OR to match
//   any one of its characters).
RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op), m_a(0), m_z(0)
{
	m_params.reserve(str.size());
	for(std::string::size_type i=0;i<str.size();i++)
		m_params.push_back(RegEx(str[i]));  // the i-th character, not always the first
}
// Destructor: has no work of its own; the members clean up after themselves.
RegEx::~RegEx() {}
// Matches (single character)
// . Wraps ch in a one-character string and defers to the string overload.
bool RegEx::Matches(char ch) const
{
	return Matches(std::string(1, ch));
}
// Matches (string)
// . True when Match reports a non-negative character count.
bool RegEx::Matches(const std::string& str) const
{
	const int matched = Match(str);
	return !(matched < 0);
}
// Match
// . Matches the given string against this regular expression,
//   starting at the first character of the string.
// . Returns the number of characters matched.
// . Returns -1 if there was no match (the reason for not returning
//   zero is that we may have an empty regex which SHOULD be
//   considered successfully matching nothing, but that of course
//   matches zero characters).
int RegEx::Match(const std::string& str) const
{
	switch(m_op) {
		case REGEX_EMPTY:
			// the empty regex only accepts the empty string
			if(str.empty())
				return 0;
			return -1;
		case REGEX_MATCH:
			if(str.empty() || str[0] != m_a)
				return -1;
			return 1;
		case REGEX_RANGE:
			// inclusive range [m_a, m_z]
			if(str.empty() || m_a > str[0] || m_z < str[0])
				return -1;
			return 1;
		case REGEX_NOT:
			// a NOT without an operand is a failure (-1), not a
			// zero-length success
			if(m_params.empty())
				return -1;
			if(m_params[0].Match(str) >= 0)
				return -1;
			return 1;
		case REGEX_OR:
			// the first alternative that matches wins
			for(unsigned i=0;i<m_params.size();i++) {
				int n = m_params[i].Match(str);
				if(n >= 0)
					return n;
			}
			return -1;
		case REGEX_SEQ: {
			// each sub-expression must match where the previous one stopped
			int offset = 0;
			for(unsigned i=0;i<m_params.size();i++) {
				// a NOT can count a character past the end of the input;
				// bail out rather than let substr throw std::out_of_range
				if(static_cast<std::string::size_type>(offset) > str.size())
					return -1;
				int n = m_params[i].Match(str.substr(offset));
				if(n == -1)
					return -1;
				offset += n;
			}
			return offset;
		}
	}
	return -1;
}
// operator !
// . Builds a REGEX_NOT expression whose single operand is ex.
RegEx operator ! (const RegEx& ex)
{
	RegEx negated(REGEX_NOT);
	negated.m_params.assign(1, ex);
	return negated;
}
// operator ||
// . Builds a REGEX_OR expression with ex1 and ex2 as its operands,
//   in that order.
RegEx operator || (const RegEx& ex1, const RegEx& ex2)
{
	RegEx either(REGEX_OR);
	std::vector <RegEx>& alternatives = either.m_params;
	alternatives.push_back(ex1);
	alternatives.push_back(ex2);
	return either;
}
// operator +
// . Builds a REGEX_SEQ expression with ex1 and ex2 as its operands,
//   in that order.
RegEx operator + (const RegEx& ex1, const RegEx& ex2)
{
	RegEx chained(REGEX_SEQ);
	std::vector <RegEx>& parts = chained.m_params;
	parts.push_back(ex1);
	parts.push_back(ex2);
	return chained;
}
}

37
regex.h Normal file
View file

@ -0,0 +1,37 @@
#pragma once
#include <vector>
#include <string>
namespace YAML
{
// The kind of node a RegEx represents:
// . REGEX_EMPTY - the default-constructed expression
// . REGEX_MATCH - a single character
// . REGEX_RANGE - a range of characters
// . REGEX_OR    - alternation of sub-expressions (operator ||)
// . REGEX_NOT   - negation of a sub-expression (operator !)
// . REGEX_SEQ   - sub-expressions in sequence (operator +)
enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_NOT, REGEX_SEQ };
// simplified regular expressions
// . Only straightforward matches (no repeated characters)
// . Only matches from start of string
// . Larger expressions are composed from smaller ones with the friend
//   operators: !a (negation), a || b (alternation), a + b (sequence)
class RegEx {
public:
// the empty expression (REGEX_EMPTY)
RegEx();
// a single character (REGEX_MATCH)
RegEx(char ch);
// a character range with bounds a and z (REGEX_RANGE)
RegEx(char a, char z);
// a compound expression built from str and combined with op
// (a sequence unless another operation is given)
RegEx(const std::string& str, REGEX_OP op = REGEX_SEQ);
~RegEx();
// true if Match succeeds on the given character/string
bool Matches(char ch) const;
bool Matches(const std::string& str) const;
// number of characters matched, or -1 if there was no match
int Match(const std::string& str) const;
// composition operators; each returns a new expression
friend RegEx operator ! (const RegEx& ex);
friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);
private:
// creates an expression with only its operation set; declared private
// here and reachable by the friend operators
RegEx(REGEX_OP op);
private:
// which kind of expression this is
REGEX_OP m_op;
// character operands, set by the char constructors
// (m_a: single character or first bound of a range; m_z: second bound)
char m_a, m_z;
// sub-expressions of a compound expression
std::vector <RegEx> m_params;
};
}

View file

@ -30,7 +30,10 @@ namespace YAML
char Scanner::GetChar()
{
m_column++;
return INPUT.get();
char ch = INPUT.get();
if(ch == '\n')
m_column = 0;
return ch;
}
// Eat
@ -87,18 +90,6 @@ namespace YAML
return false;
}
// IsLineBreak
bool Scanner::IsLineBreak(char ch)
{
return ch == '\n'; // TODO: More types of line breaks
}
// IsBlank
bool Scanner::IsBlank(char ch)
{
return IsLineBreak(ch) || ch == ' ' || ch == '\t' || ch == EOF;
}
// IsDocumentStart
bool Scanner::IsDocumentStart()
{
@ -106,8 +97,7 @@ namespace YAML
if(m_column != 0)
return false;
std::string next = Peek(4);
return next[0] == '-' && next[1] == '-' && next[2] == '-' && IsBlank(next[3]);
return Exp::DocStart.Matches(Peek(4));
}
// IsDocumentEnd
@ -117,61 +107,41 @@ namespace YAML
if(m_column != 0)
return false;
std::string next = Peek(4);
return next[0] == '.' && next[1] == '.' && next[2] == '.' && IsBlank(next[3]);
return Exp::DocEnd.Matches(Peek(4));
}
// IsBlockEntry
bool Scanner::IsBlockEntry()
{
std::string next = Peek(2);
return next[0] == Keys::BlockEntry && IsBlank(next[1]);
return Exp::BlockEntry.Matches(Peek(2));
}
// IsKey
bool Scanner::IsKey()
{
std::string next = Peek(2);
return next[0] == Keys::Key && (IsBlank(next[1]) || m_flowLevel > 0);
if(m_flowLevel > 0)
return Exp::KeyInFlow.Matches(next);
return Exp::Key.Matches(next);
}
// IsValue
bool Scanner::IsValue()
{
std::string next = Peek(2);
return next[0] == Keys::Value && (IsBlank(next[1]) || m_flowLevel > 0);
if(m_flowLevel > 0)
return Exp::ValueInFlow.Matches(next);
return Exp::Value.Matches(next);
}
// IsPlainScalar
// . Rules:
// . Cannot start with a blank.
// . Can never start with any of , [ ] { } # & * ! | > \' \" % @ `
// . In the block context - ? : must be not be followed with a space.
// . In the flow context ? : are illegal and - must not be followed with a space.
bool Scanner::IsPlainScalar()
{
std::string next = Peek(2);
if(IsBlank(next[0]))
return false;
// never characters
if(std::string(",[]{}#&*!|>\'\"%@`").find(next[0]) != std::string::npos)
return false;
// specific block/flow characters
if(m_flowLevel == 0) {
if((next[0] == '-' || next[0] == '?' || next[0] == ':') && IsBlank(next[1]))
return false;
} else {
if(next[0] == '?' || next[0] == ':')
return false;
if(next[0] == '-' && IsBlank(next[1]))
return false;
}
return true;
if(m_flowLevel > 0)
return Exp::PlainScalarInFlow.Matches(next);
return Exp::PlainScalar.Matches(next);
}
///////////////////////////////////////////////////////////////////////
@ -233,7 +203,7 @@ namespace YAML
// DocumentEndToken
template <> DocumentEndToken *Scanner::ScanToken(DocumentEndToken *pToken)
{
PopIndentTo(m_column);
PopIndentTo(-1);
// TODO: "reset simple keys"
m_simpleKeyAllowed = false;
@ -389,8 +359,8 @@ namespace YAML
m_simpleKeyAllowed = false;
// now eat and store the scalar
std::string scalar;
bool leadingBlanks = true;
std::string scalar, whitespace, leadingBreaks, trailingBreaks;
bool leadingBlanks = false;
while(INPUT) {
// doc start/end tokens
@ -398,43 +368,72 @@ namespace YAML
break;
// comment
if(INPUT.peek() == Keys::Comment)
if(Exp::Comment.Matches(INPUT.peek()))
break;
// first eat non-blanks
while(INPUT && !IsBlank(INPUT.peek())) {
while(INPUT && !Exp::BlankOrBreak.Matches(INPUT.peek())) {
std::string next = Peek(2);
// illegal colon in flow context
if(m_flowLevel > 0 && next[0] == ':') {
if(!IsBlank(next[1]))
throw IllegalScalar();
}
if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(next))
throw IllegalScalar();
// characters that might end the scalar
if(next[0] == ':' && IsBlank(next[1]))
if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(next))
break;
if(m_flowLevel > 0 && std::string(",:?[]{}").find(next[0]) != std::string::npos)
if(m_flowLevel == 0 && Exp::EndScalar.Matches(next))
break;
if(leadingBlanks) {
if(!leadingBreaks.empty() && leadingBreaks[0] == '\n') {
// fold line break?
if(trailingBreaks.empty())
scalar += ' ';
else {
scalar += trailingBreaks;
trailingBreaks = "";
}
} else {
scalar += leadingBreaks + trailingBreaks;
leadingBreaks = "";
trailingBreaks = "";
}
} else if(!whitespace.empty()) {
scalar += whitespace;
whitespace = "";
}
// finally, read the character!
scalar += GetChar();
}
// did we hit a non-blank character that ended us?
if(!Exp::BlankOrBreak.Matches(INPUT.peek()))
break;
// now eat blanks
while(INPUT && (IsBlank(INPUT.peek()) /* || IsBreak(INPUT.peek()) */)) {
if(IsBlank(INPUT.peek())) {
while(INPUT && Exp::BlankOrBreak.Matches(INPUT.peek())) {
if(Exp::Blank.Matches(INPUT.peek())) {
if(leadingBlanks && m_column <= m_indents.top())
throw IllegalTabInScalar();
// TODO: Store some blanks?
Eat(1);
// maybe store this character
if(!leadingBlanks)
whitespace += GetChar();
else
Eat(1);
} else {
Eat(1);
// where to store this character?
if(!leadingBlanks) {
leadingBlanks = true;
whitespace = "";
leadingBreaks += GetChar();
} else
trailingBreaks += GetChar();
}
}
// TODO: join whitespace
// and finally break if we're below the indentation level
if(m_flowLevel == 0 && m_column <= m_indents.top())
break;
@ -532,14 +531,14 @@ namespace YAML
Eat(1);
// then eat a comment
if(INPUT.peek() == Keys::Comment) {
if(Exp::Comment.Matches(INPUT.peek())) {
// eat until line break
while(INPUT && !IsLineBreak(INPUT.peek()))
while(INPUT && !Exp::Break.Matches(INPUT.peek()))
Eat(1);
}
// if it's NOT a line break, then we're done!
if(!IsLineBreak(INPUT.peek()))
if(!Exp::Break.Matches(INPUT.peek()))
break;
// otherwise, let's eat the line break and keep going

View file

@ -5,22 +5,49 @@
#include <queue>
#include <stack>
#include <set>
#include "regex.h"
namespace YAML
{
class Token;
// Exp
// . The simplified regular expressions the scanner uses to recognize
//   indicators and scalar boundaries.
// . Strings that stand for a SET of characters must be built with
//   REGEX_OR; the default (REGEX_SEQ) matches the literal string.
namespace Exp
{
	// misc
	const RegEx Blank = RegEx(' ') || RegEx('\t');
	const RegEx Break = RegEx('\n');
	const RegEx BlankOrBreak = Blank || Break;

	// actual tags
	const RegEx DocStart = RegEx("---") + (BlankOrBreak || RegEx(EOF) || RegEx());
	const RegEx DocEnd = RegEx("...") + (BlankOrBreak || RegEx(EOF) || RegEx());
	const RegEx BlockEntry = RegEx('-') + (BlankOrBreak || RegEx(EOF));

	// . In the block context the indicator must be followed by a blank,
	//   a line break or the end of input; in the flow context the
	//   indicator alone is enough.
	const RegEx Key = RegEx('?') + (BlankOrBreak || RegEx(EOF)),
	            KeyInFlow = RegEx('?');
	const RegEx Value = RegEx(':') + (BlankOrBreak || RegEx(EOF)),
	            ValueInFlow = RegEx(':');
	const RegEx Comment = RegEx('#');

	// Plain scalar rules:
	// . Cannot start with a blank.
	// . Can never start with any of , [ ] { } # & * ! | > \' \" % @ `
	// . In the block context - ? : must not be followed with a space.
	// . In the flow context ? : are illegal and - must not be followed with a space.
	const RegEx PlainScalar = !(BlankOrBreak || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:", REGEX_OR) + Blank)),
	            PlainScalarInFlow = !(BlankOrBreak || RegEx("?:,[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx('-') + Blank));

	// a colon followed by a non-blank (checked in the flow context)
	const RegEx IllegalColonInScalar = RegEx(':') + !BlankOrBreak;

	// what ends a plain scalar: a colon followed by a blank/break, or
	// (flow context only) any one of the flow indicator characters
	const RegEx EndScalar = RegEx(':') + BlankOrBreak,
	            EndScalarInFlow = (RegEx(':') + BlankOrBreak) || RegEx(",:?[]{}", REGEX_OR);
}
namespace Keys
{
const char Comment = '#';
const char FlowSeqStart = '[';
const char FlowSeqEnd = ']';
const char FlowMapStart = '{';
const char FlowMapEnd = '}';
const char FlowEntry = ',';
const char BlockEntry = '-';
const char Key = '?';
const char Value = ':';
const char Alias = '*';
const char Anchor = '&';
const char Tag = '!';
@ -49,8 +76,6 @@ namespace YAML
void EatLineBreak();
bool IsWhitespaceToBeEaten(char ch);
bool IsLineBreak(char ch);
bool IsBlank(char ch);
bool IsDocumentStart();
bool IsDocumentEnd();
bool IsBlockEntry();

View file

@ -1,3 +1,5 @@
---
- milk
- eggs
- cheese and bread # this is really important!
- eggs # this is really important!
- cheese and bread
...

View file

@ -185,6 +185,10 @@
RelativePath=".\parser.cpp"
>
</File>
<File
RelativePath=".\regex.cpp"
>
</File>
<File
RelativePath=".\scalar.cpp"
>
@ -227,6 +231,10 @@
RelativePath=".\parser.h"
>
</File>
<File
RelativePath=".\regex.h"
>
</File>
<File
RelativePath=".\scalar.h"
>