Added quoted scalars (with escaping).

Refactored some common whitespace-parsing code in scanning both scalars. Implemented the flow collection tokens.
2025-04-28 13:28:01 +03:00 · 2008-06-27 23:11:46 +00:00 · 2008-06-27 23:11:46 +00:00 · 49a75b2d78
commit 49a75b2d78
parent a224c7818b
9 changed files with 307 additions and 54 deletions
--- a/exceptions.h
+++ b/exceptions.h
@ -12,4 +12,21 @@ namespace YAML
 	class IllegalMapValue: public Exception {};
 	class IllegalScalar: public Exception {};
 	class IllegalTabInScalar: public Exception {};
+	class DocIndicatorInQuote: public Exception {};  // thrown by the quoted-scalar scanner when a document start/end indicator shows up before the closing quote
+	class EOFInQuote: public Exception {};  // thrown by the quoted-scalar scanner when the input ends before the closing quote
+	// Thrown when the character following a backslash is not a recognized
+	// escape code; 'ch' holds that character.
+	class UnknownEscapeSequence: public Exception {
+	public:
+		// explicit: a bare char must never silently convert into an exception
+		explicit UnknownEscapeSequence(char ch_): ch(ch_) {}
+		char ch;
+	};
+	// Thrown when a character that should be a hexadecimal digit is not one;
+	// 'ch' holds that character.
+	class NonHexNumber: public Exception {
+	public:
+		// explicit: a bare char must never silently convert into an exception
+		explicit NonHexNumber(char ch_): ch(ch_) {}
+		char ch;
+	};
+	// Thrown when an escaped code point is not a legal Unicode scalar value;
+	// 'value' holds the rejected number.
+	class InvalidUnicode: public Exception {
+	public:
+		// explicit: a bare integer must never silently convert into an exception
+		explicit InvalidUnicode(unsigned value_): value(value_) {}
+		unsigned value;
+	};
 }
--- a/exp.cpp
+++ b/exp.cpp
@ -0,0 +1,106 @@
+#include "exp.h"
+#include "exceptions.h"
+
+namespace YAML
+{
+	namespace Exp
+	{
+		// ParseHex
+		// . Interprets 'str' as an unsigned hexadecimal number (upper- or lower-case digits).
+		// . Throws NonHexNumber, carrying the offending character, on anything that isn't a hex digit.
+		// . An empty string yields 0. No overflow check is made: digits that don't fit
+		//   in an unsigned are silently shifted out.
+		unsigned ParseHex(const std::string& str)
+		{
+			unsigned value = 0;
+			for(std::string::size_type i=0;i<str.size();i++) {
+				const char ch = str[i];
+				unsigned digit = 0;
+				if('a' <= ch && ch <= 'f')
+					digit = ch - 'a' + 10;
+				else if('A' <= ch && ch <= 'F')
+					digit = ch - 'A' + 10;
+				else if('0' <= ch && ch <= '9')
+					digit = ch - '0';
+				else
+					throw NonHexNumber(ch);
+
+				// make room for the next nibble and append it
+				value = (value << 4) + digit;
+			}
+
+			return value;
+		}
+
+		// Str
+		// . Builds a one-character string out of 'ch'.
+		std::string Str(char ch)
+		{
+			return std::string(1, ch);
+		}
+
+		// Escape
+		// . Reads the next 'codeLength' characters as a hex code point, adds
+		//   'codeLength' to 'length', and returns the code point encoded as UTF-8.
+		// . Throws NonHexNumber if a character isn't a hex digit, and InvalidUnicode
+		//   if the value is a surrogate (0xD800-0xDFFF) or lies above 0x10FFFF.
+		std::string Escape(std::istream& in, int& length, int codeLength)
+		{
+			// account for the digits up front, then pull them off the stream
+			length += codeLength;
+			std::string digits;
+			int remaining = codeLength;
+			while(remaining-- > 0)
+				digits += static_cast<char>(in.get());
+
+			const unsigned codePoint = ParseHex(digits);
+
+			// surrogates and anything past the last plane are not legal
+			const bool isSurrogate = (codePoint >= 0xD800 && codePoint <= 0xDFFF);
+			if(isSurrogate || codePoint > 0x10FFFF)
+				throw InvalidUnicode(codePoint);
+
+			// emit one to four UTF-8 bytes depending on the magnitude
+			std::string utf8;
+			if(codePoint <= 0x7F) {
+				utf8 += static_cast<char>(codePoint);
+			} else if(codePoint <= 0x7FF) {
+				utf8 += static_cast<char>(0xC0 + (codePoint >> 6));
+				utf8 += static_cast<char>(0x80 + (codePoint & 0x3F));
+			} else if(codePoint <= 0xFFFF) {
+				utf8 += static_cast<char>(0xE0 + (codePoint >> 12));
+				utf8 += static_cast<char>(0x80 + ((codePoint >> 6) & 0x3F));
+				utf8 += static_cast<char>(0x80 + (codePoint & 0x3F));
+			} else {
+				utf8 += static_cast<char>(0xF0 + (codePoint >> 18));
+				utf8 += static_cast<char>(0x80 + ((codePoint >> 12) & 0x3F));
+				utf8 += static_cast<char>(0x80 + ((codePoint >> 6) & 0x3F));
+				utf8 += static_cast<char>(0x80 + (codePoint & 0x3F));
+			}
+
+			return utf8;
+		}
+
+		// Escape
+		// . Escapes the sequence starting 'in' (it must begin with a '\')
+		//   and returns the result as UTF-8 bytes.
+		// . Fills 'length' with how many characters we ate: 2 for the slash and
+		//   the escape character, plus the hex digits of \x, \u and \U.
+		// . Throws UnknownEscapeSequence if it's an unknown escape character.
+		std::string Escape(std::istream& in, int& length)
+		{
+			// slash + character
+			length = 2;
+
+			// eat slash
+			in.get();
+
+			// switch on escape character
+			char ch = in.get();
+			switch(ch) {
+				// "\0" as a C string is empty, so the NUL must be built with an explicit length
+				case '0': return std::string(1, '\0');
+				case 'a': return "\x07";
+				case 'b': return "\x08";
+				case 't':
+				case '\t': return "\x09";
+				case 'n': return "\x0A";
+				case 'v': return "\x0B";
+				case 'f': return "\x0C";
+				case 'r': return "\x0D";
+				case 'e': return "\x1B";
+				case ' ': return "\x20";
+				case '\"': return "\"";
+				case '\'': return "\'";
+				case '\\': return "\\";
+				case 'N': return "\xC2\x85";  // NEL (#x85)
+				case '_': return "\xC2\xA0";  // #xA0
+				case 'L': return "\xE2\x80\xA8";  // LS (#x2028)
+				case 'P': return "\xE2\x80\xA9";  // PS (#x2029)
+				case 'x': return Escape(in, length, 2);
+				case 'u': return Escape(in, length, 4);
+				case 'U': return Escape(in, length, 8);
+			}
+
+			throw UnknownEscapeSequence(ch);
+		}
+	}
+}
--- a/exp.h
+++ b/exp.h
@ -1,6 +1,8 @@
 #pragma once

 #include "regex.h"
+#include <string>
+#include <ios>

 namespace YAML
 {
@ -13,6 +15,8 @@ namespace YAML
 		const RegEx Blank = RegEx(' ') || RegEx('\t');
 		const RegEx Break = RegEx('\n');
 		const RegEx BlankOrBreak = Blank || Break;
+		const RegEx Digit = RegEx('0', '9');
+		const RegEx Hex = Digit || RegEx('A', 'F') || RegEx('a', 'f');

 		// actual tags

@ -30,11 +34,17 @@ namespace YAML
 		// . Can never start with any of , [ ] { } # & * ! | > \' \" % @ `
 		// . In the block context - ? : must be not be followed with a space.
 		// . In the flow context ? : are illegal and - must not be followed with a space.
-		const RegEx PlainScalar = !(BlankOrBreak || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:") + Blank)),
+		const RegEx PlainScalar = !(BlankOrBreak || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:", REGEX_OR) + Blank)),
 	                PlainScalarInFlow = !(BlankOrBreak || RegEx("?:,[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx('-') + Blank));
 		const RegEx IllegalColonInScalar = RegEx(':') + !BlankOrBreak;
 		const RegEx EndScalar = RegEx(':') + BlankOrBreak,
-		            EndScalarInFlow = (RegEx(':') + BlankOrBreak) || RegEx(",:?[]{}");
+		            EndScalarInFlow = (RegEx(':') + BlankOrBreak) || RegEx(",:?[]{}", REGEX_OR);
+
+		const RegEx EscSingleQuote = RegEx("\'\'");
+		const RegEx EscBreak = RegEx('\\') + Break;
+
+		// and some functions
+		std::string Escape(std::istream& in, int& length);
 	}

 	namespace Keys
--- a/regex.cpp
+++ b/regex.cpp
@ -35,7 +35,7 @@ namespace YAML
 	RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op), m_pOp(0)
 	{
 		for(unsigned i=0;i<str.size();i++)
-			m_params.push_back(RegEx(str[0]));
+			m_params.push_back(RegEx(str[i]));

 		SetOp();
 	}
--- a/scanner.cpp
+++ b/scanner.cpp
@ -286,6 +286,22 @@ namespace YAML
 		}
 	}

+	// IncreaseFlowLevel
+	// . Bumps the flow nesting counter by one.
+	void Scanner::IncreaseFlowLevel()
+	{
+		// TODO: Push simple key
+		++m_flowLevel;
+	}
+
+	// DecreaseFlowLevel
+	// . Drops the flow nesting counter by one; does nothing if it isn't positive.
+	void Scanner::DecreaseFlowLevel()
+	{
+		if(m_flowLevel <= 0)
+			return;
+
+		--m_flowLevel;
+		// TODO: Pop simple key
+	}
+
 	// temporary function for testing
 	void Scanner::Scan()
 	{
--- a/scanner.h
+++ b/scanner.h
@ -20,6 +20,8 @@ namespace YAML
 		void ScanToNextToken();
 		void PushIndentTo(int column, bool sequence);
 		void PopIndentTo(int column);
+		void IncreaseFlowLevel();
+		void DecreaseFlowLevel();

 		void Scan();

@ -37,6 +39,16 @@ namespace YAML
 		bool IsValue();
 		bool IsPlainScalar();

+		struct WhitespaceInfo {  // whitespace collected between the non-blank runs of a scalar
+			WhitespaceInfo();
+			void AddBlank(char ch);  // buffers 'ch' in 'whitespace' unless a line break has been recorded since the last Join()
+			void AddBreak(const std::string& line);  // first break goes to 'leadingBreaks' (and discards buffered blanks); later ones go to 'trailingBreaks'
+			std::string Join();  // returns the text to insert for the collected whitespace and resets the buffers it used
+
+			bool leadingBlanks;  // set by AddBreak(), cleared by Join()
+			std::string whitespace, leadingBreaks, trailingBreaks;
+		};
+
 		template <typename T> void ScanAndEnqueue(T *pToken);
 		template <typename T> T *ScanToken(T *pToken);

--- a/scantoken.cpp
+++ b/scantoken.cpp
@ -64,8 +64,8 @@ namespace YAML
 	template <> FlowSeqStartToken *Scanner::ScanToken(FlowSeqStartToken *pToken)
 	{
 		// TODO: "save simple key"
-		// TODO: increase flow level

+		IncreaseFlowLevel();
 		m_simpleKeyAllowed = true;

 		// eat
@ -77,8 +77,8 @@ namespace YAML
 	template <> FlowMapStartToken *Scanner::ScanToken(FlowMapStartToken *pToken)
 	{
 		// TODO: "save simple key"
-		// TODO: increase flow level

+		IncreaseFlowLevel();
 		m_simpleKeyAllowed = true;

 		// eat
@ -90,8 +90,8 @@ namespace YAML
 	template <> FlowSeqEndToken *Scanner::ScanToken(FlowSeqEndToken *pToken)
 	{
 		// TODO: "remove simple key"
-		// TODO: decrease flow level

+		DecreaseFlowLevel();
 		m_simpleKeyAllowed = false;

 		// eat
@ -103,8 +103,8 @@ namespace YAML
 	template <> FlowMapEndToken *Scanner::ScanToken(FlowMapEndToken *pToken)
 	{
 		// TODO: "remove simple key"
-		// TODO: decrease flow level

+		DecreaseFlowLevel();
 		m_simpleKeyAllowed = false;

 		// eat
@ -210,8 +210,8 @@ namespace YAML
 		m_simpleKeyAllowed = false;

 		// now eat and store the scalar
-		std::string scalar, whitespace, leadingBreaks, trailingBreaks;
-		bool leadingBlanks = false;
+		std::string scalar;
+		WhitespaceInfo info;

 		while(INPUT) {
 			// doc start/end tokens
@ -234,26 +234,6 @@ namespace YAML
 				if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT))
 					break;

-				// join whitespace
-				if(leadingBlanks) {
-					if(Exp::Break.Matches(leadingBreaks)) {
-						// fold line break?
-						if(trailingBreaks.empty())
-							scalar += ' ';
-						else
-							scalar += trailingBreaks;
-					} else {
-						scalar += leadingBreaks + trailingBreaks;
-					}
-
-					leadingBlanks = false;
-					leadingBreaks = "";
-					trailingBreaks = "";
-				} else if(!whitespace.empty()) {
-					scalar += whitespace;
-					whitespace = "";
-				}
-
 				// finally, read the character!
 				scalar += GetChar();
 			}
@ -266,37 +246,29 @@ namespace YAML
 			while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) {
 				if(Exp::Blank.Matches(INPUT)) {
 					// can't use tabs as indentation! only spaces!
-					if(INPUT.peek() == '\t' && leadingBlanks && m_column <= m_indents.top())
+					if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top())
 						throw IllegalTabInScalar();

-					// maybe store this character
-					if(!leadingBlanks)
-						whitespace += GetChar();
-					else
-						Eat(1);
-				} else {
+					info.AddBlank(GetChar());
+				} else	{
 					// we know it's a line break; see how many characters to read
 					int n = Exp::Break.Match(INPUT);
 					std::string line = GetChar(n);
-
-					// where to store this character?
-					if(!leadingBlanks) {
-						leadingBlanks = true;
-						whitespace = "";
-						leadingBreaks += line;
-					} else
-						trailingBreaks += line;
+					info.AddBreak(line);
 				}
 			}

-			// and finally break if we're below the indentation level
+			// break if we're below the indentation level
 			if(m_flowLevel == 0 && m_column <= m_indents.top())
 				break;
+
+			// finally join whitespace
+			scalar += info.Join();
 		}

 		// now modify our token
 		pToken->value = scalar;
-		if(leadingBlanks)
+		if(info.leadingBlanks)
 			m_simpleKeyAllowed = true;

 		return pToken;
@ -305,6 +277,128 @@ namespace YAML
 	// QuotedScalarToken
 	template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken)
 	{
+		// TODO: "save simple key"
+
+		m_simpleKeyAllowed = false;
+
+		// eat single or double quote
+		char quote = GetChar();
+		bool single = (quote == '\'');
+
+		// now eat and store the scalar
+		std::string scalar;
+		WhitespaceInfo info;
+
+		while(INPUT) {
+			if(IsDocumentStart() || IsDocumentEnd())
+				throw DocIndicatorInQuote();
+
+			if(INPUT.peek() == EOF)
+				throw EOFInQuote();
+
+			// first eat non-blanks
+			while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
+				// escaped single quote?
+				if(single && Exp::EscSingleQuote.Matches(INPUT)) {
+					int n = Exp::EscSingleQuote.Match(INPUT);
+					scalar += GetChar(n);
+					continue;
+				}
+
+				// is the quote ending?
+				if(INPUT.peek() == (single ? '\'' : '\"'))
+					break;
+
+				// escaped newline?
+				if(Exp::EscBreak.Matches(INPUT))
+					break;
+
+				// other escape sequence
+				if(INPUT.peek() == '\\') {
+					int length = 0;
+					scalar += Exp::Escape(INPUT, length);
+					m_column += length;
+					continue;
+				}
+
+				// and finally, just add the damn character
+				scalar += GetChar();
+			}
+
+			// is the quote ending?
+			if(INPUT.peek() == (single ? '\'' : '\"')) {
+				// eat and go
+				GetChar();
+				break;
+			}
+
+			// now we eat blanks
+			while(Exp::BlankOrBreak.Matches(INPUT)) {
+				if(Exp::Blank.Matches(INPUT)) {
+					info.AddBlank(GetChar());
+				} else {
+					// we know it's a line break; see how many characters to read
+					int n = Exp::Break.Match(INPUT);
+					std::string line = GetChar(n);
+					info.AddBreak(line);
+				}
+			}
+
+			// and finally join the whitespace
+			scalar += info.Join();
+		}
+
+		pToken->value = scalar;
 		return pToken;
 	}
+
+	//////////////////////////////////////////////////////////
+	// WhitespaceInfo stuff
+
+	Scanner::WhitespaceInfo::WhitespaceInfo(): leadingBlanks(false)  // starts with no line break recorded; the string members default to empty
+	{
+	}
+
+	// AddBlank
+	// . Buffers a blank character, unless a line break has already been
+	//   recorded, in which case the blank is dropped.
+	void Scanner::WhitespaceInfo::AddBlank(char ch)
+	{
+		if(leadingBlanks)
+			return;
+
+		whitespace += ch;
+	}
+
+	// AddBreak
+	// . Records a line break. The first one discards any buffered blanks and
+	//   is kept as a leading break; every later one is a trailing break.
+	void Scanner::WhitespaceInfo::AddBreak(const std::string& line)
+	{
+		if(leadingBlanks) {
+			// already past the first break
+			trailingBreaks += line;
+			return;
+		}
+
+		// first break since the last join
+		leadingBlanks = true;
+		whitespace = "";
+		leadingBreaks += line;
+	}
+
+	// Join
+	// . Returns the text that the collected whitespace contributes to the
+	//   scalar, and resets whatever was consumed.
+	// . With no break recorded, that is just the buffered blanks.
+	// . With breaks recorded: if the leading break matches Exp::Break it is
+	//   folded into a space (or replaced by the trailing breaks, if any);
+	//   otherwise leading and trailing breaks are kept as they are.
+	std::string Scanner::WhitespaceInfo::Join()
+	{
+		if(!leadingBlanks) {
+			// only blanks (possibly none): hand them over and empty the buffer
+			std::string blanks;
+			blanks.swap(whitespace);
+			return blanks;
+		}
+
+		std::string joined;
+		if(!Exp::Break.Matches(leadingBreaks))
+			joined = leadingBreaks + trailingBreaks;
+		else if(trailingBreaks.empty())
+			joined = " ";  // fold line break
+		else
+			joined = trailingBreaks;
+
+		// start afresh for the next run of whitespace
+		leadingBlanks = false;
+		leadingBreaks = "";
+		trailingBreaks = "";
+
+		return joined;
+	}
 }
--- a/test.yaml
+++ b/test.yaml
@ -1,10 +1,4 @@
 ---
- green
-  eggs,
-  and
-  ham!
- eggs             # this is really important!
- - cheddar cheese
-  - american cheese
- bread
+- milk and eggs
+- [cheddar, american, swiss]
 ...
--- a/yaml-reader.vcproj
+++ b/yaml-reader.vcproj
@ -169,6 +169,10 @@
 				RelativePath=".\document.cpp"
 				>
 			</File>
+			<File
+				RelativePath=".\exp.cpp"
+				>
+			</File>
 			<File
 				RelativePath=".\main.cpp"
 				>