Imported Upstream version 0.26.0

This commit is contained in:
Bret Curtis 2013-10-17 16:37:22 +02:00
commit 9a2b6c69b6
1398 changed files with 212217 additions and 0 deletions

View file

@ -0,0 +1,8 @@
# Regenerate the conversion tables by running the generator program.
tables_gen.hpp: gen_iconv
	./gen_iconv > tables_gen.hpp

# Build the table generator from its single source file.
gen_iconv: gen_iconv.cpp
	g++ -Wall $^ -o $@

# 'clean' does not produce a file, so declare it phony; otherwise a file
# named 'clean' in this directory would stop the rule from ever running.
.PHONY: clean
clean:
	rm -f ./gen_iconv

View file

@ -0,0 +1,118 @@
// This program generates the file tables_gen.hpp
#include <iostream>
using namespace std;
#include <iconv.h>
#include <cassert>
// Emit the indentation that precedes every table row.
void tab()
{
    std::cout << " ";
}
// Print a single table value: a leading space, the value as a decimal
// integer, and a trailing comma unless 'last' is set.
void num(char i, bool last)
{
    // Go through int so the byte is printed as a number (-128 to 127 where
    // char is signed) instead of being written as a raw, possibly
    // non-printable, character into the generated source.
    const int asNumber = static_cast<int>(i);
    std::cout << " " << asNumber;

    if (last)
        return;
    std::cout << ",";
}
// Print one table row: the UTF-8 length (1-5) followed by all five data
// bytes of 'value', then an optional trailing "// comment".
//
// 'last' suppresses the comma after the final number of the row, which is
// needed for the very last row of a table.
void writeChar(char *value, int length, bool last, const std::string &comment="")
{
    assert(length >= 1 && length <= 5);

    tab();
    num(length, false);

    // Always emit five data bytes so every row has the same width.
    int idx = 0;
    while (idx < 5)
    {
        const bool finalNumber = last && idx == 4;
        num(value[idx], finalNumber);
        ++idx;
    }

    if (!comment.empty())
        std::cout << " // " << comment;

    std::cout << std::endl;
}
// Print the row used for bytes that have no mapping in the charset:
// a single space character, padded with zero bytes.
void writeMissing(bool last)
{
    char placeholder[5] = { ' ', 0, 0, 0, 0 };
    writeChar(placeholder, 1, last, "not part of this charset");
}
// Generate one conversion table and print it to stdout as a C array.
//
// Every byte value 0-255 of 'charset' is converted to UTF-8 with iconv and
// printed as one row through writeChar(); bytes that iconv cannot convert
// are printed through writeMissing().
//
// charset:   iconv name of the source encoding, e.g. "WINDOWS-1252"
// tableName: identifier of the generated array
//
// Returns 0 on success. Returns 1, after reporting on stderr and without
// printing anything to stdout, if iconv cannot open the conversion.
int write_table(const std::string &charset, const std::string &tableName)
{
    // Open the conversion before printing anything, so that a failure does
    // not leave a half-written table in the generated header.
    iconv_t cd = iconv_open("UTF-8", charset.c_str());
    if (cd == (iconv_t)-1) // iconv_open reports failure as (iconv_t)-1
    {
        std::cerr << "gen_iconv: cannot open conversion from " << charset
                  << " to UTF-8" << std::endl;
        return 1;
    }

    // Write table header
    std::cout << "static signed char " << tableName << "[] =\n{\n";

    // Convert each character from 0 to 255
    for (int i = 0; i < 256; i++)
    {
        const bool last = (i == 255);

        char input = static_cast<char>(i);
        char *iptr = &input;
        size_t ileft = 1;

        char output[5] = { 0, 0, 0, 0, 0 };
        char *optr = output;
        size_t oleft = 5;

        // A non-zero result means the byte either failed to convert or was
        // converted non-reversibly; both are treated as "missing".
        const size_t res = iconv(cd, &iptr, &ileft, &optr, &oleft);
        if (res)
            writeMissing(last);
        else
            writeChar(output, static_cast<int>(5 - oleft), last);
    }

    iconv_close(cd);

    // Finish table
    std::cout << "};\n";
    return 0;
}
// Entry point of the generator: prints the complete tables_gen.hpp to
// stdout (header guard, namespace, one table per supported code page).
int main()
{
    // Header guard and namespace opening
    std::cout << "#ifndef COMPONENTS_TOUTF8_TABLE_GEN_H\n#define COMPONENTS_TOUTF8_TABLE_GEN_H\n\n"
              << "namespace ToUTF8\n{\n\n";

    // WINDOWS-1250: Central and Eastern European languages written in Latin
    // script (Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian,
    // Serbian in Latin script, Romanian, Albanian).
    std::cout << "\n/// Central European and Eastern European languages that use Latin script,"
                 "\n/// such as Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian,"
                 "\n/// Serbian (Latin script), Romanian and Albanian."
                 "\n";
    write_table("WINDOWS-1250", "windows_1250");

    // WINDOWS-1251: Cyrillic (Russian, Bulgarian, Serbian Cyrillic, ...)
    std::cout << "\n/// Cyrillic alphabet such as Russian, Bulgarian, Serbian Cyrillic"
                 "\n/// and other languages"
                 "\n";
    write_table("WINDOWS-1251", "windows_1251");

    // WINDOWS-1252: English and other Western languages
    std::cout << "\n/// Latin alphabet used by English and some other Western languages"
                 "\n";
    write_table("WINDOWS-1252", "windows_1252");

    // CP437 is emitted without a documentation comment of its own.
    write_table("CP437", "cp437");

    // Namespace closing and end of header guard
    std::cout << "\n}\n\n"
              << "#endif\n\n";

    return 0;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,4 @@
original: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
converted: Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?
original: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.
converted: Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1,18 @@
#!/bin/bash
# Build the test programs, then run every *_test binary in this directory.
#
# If a reference output exists in output/<name>.out, the program's output is
# diffed against it. Otherwise the current output is recorded as the new
# reference and added to git.

make || exit

mkdir -p output

for a in *_test; do
    # When nothing matches, the glob stays as the literal string "*_test";
    # skip that (and anything else that is not an executable file).
    if [ ! -f "$a" ] || [ ! -x "$a" ]; then
        continue
    fi

    if [ -f "output/$a.out" ]; then
        echo "Running $a:"
        "./$a" | diff "output/$a.out" -
    else
        echo "Creating $a.out"
        "./$a" > "output/$a.out"
        git add "output/$a.out"
    fi
done

View file

@ -0,0 +1 @@
Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1 @@
Vous lui donnez le gâteau sans protester avant daller chercher tous vos amis et de revenir vous venger.

View file

@ -0,0 +1 @@
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?

View file

@ -0,0 +1 @@
Без вопросов отдаете ему рулет, зная, что позже вы сможете привести с собой своих друзей и тогда он получит по заслугам?

View file

@ -0,0 +1,59 @@
#include <iostream>
#include <fstream>
#include <cassert>
#include <stdexcept>
#include "../to_utf8.hpp"
std::string getFirstLine(const std::string &filename);
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
const std::string &utf8File);
/// Round-trip test for one character encoding.
///
/// Reads the first line of \a legacyEncFile and of \a utf8File, which are
/// expected to hold the same text in the legacy encoding and in UTF-8.
/// Asserts that converting the legacy line yields the UTF-8 line and that
/// converting the UTF-8 line yields the legacy line.
void testEncoder(ToUTF8::FromType encoding, const std::string &legacyEncFile,
                 const std::string &utf8File)
{
    // Load the reference text in both encodings
    const std::string legacyText = getFirstLine(legacyEncFile);
    const std::string utf8Text = getFirstLine(utf8File);

    ToUTF8::Utf8Encoder encoder(encoding);

    // legacy -> UTF-8
    const std::string toUtf8 = encoder.getUtf8(legacyText);
    std::cout << "original: " << utf8Text << std::endl;
    std::cout << "converted: " << toUtf8 << std::endl;
    assert(toUtf8 == utf8Text);

    // UTF-8 -> legacy
    const std::string toLegacy = encoder.getLegacyEnc(utf8Text);
    assert(toLegacy == legacyText);
}
/// Return the first line of the file \a filename, without its line ending.
/// Throws std::runtime_error if the file cannot be opened.
std::string getFirstLine(const std::string &filename)
{
    std::ifstream in(filename.c_str());
    if (!in.is_open())
        throw std::runtime_error("Unable to open file " + filename);

    std::string firstLine;
    std::getline(in, firstLine);
    in.close();

    return firstLine;
}
int main()
{
testEncoder(ToUTF8::WINDOWS_1251, "test_data/russian-win1251.txt", "test_data/russian-utf8.txt");
testEncoder(ToUTF8::WINDOWS_1252, "test_data/french-win1252.txt", "test_data/french-utf8.txt");
return 0;
}

View file

@ -0,0 +1,344 @@
#include "to_utf8.hpp"
#include <vector>
#include <cassert>
#include <iostream>
#include <iomanip>
/* This file contains the code to translate from WINDOWS-1252 (native
charset used in English version of Morrowind) to UTF-8. The library
is designed to be extended to support more source encodings later,
which means that we may add support for Russian, Polish and Chinese
files and so on.
The code does not depend on any external library at
runtime. Instead, it uses a pregenerated table made with iconv (see
gen_iconv.cpp and the Makefile) which is located in tables_gen.hpp.
This is both faster and uses fewer dependencies. The tables would
only need to be regenerated if we are adding support for more input
encodings. As such, there is no need to make the generator code
platform independent.
The library is optimized for the case of pure ASCII input strings,
which is the vast majority of cases at least for the English
version. A test of my version of Morrowind.esm got 130 non-ASCII vs
236195 ASCII strings, or less than 0.06% of strings containing
non-ASCII characters.
To optimize for this, if the first pass of the string does not find
any non-ASCII characters, the entire string is passed along without
any modification.
Most of the non-ASCII strings are books, and are quite large. (The
non-ASCII characters are typically starting and ending quotation
marks.) Within these, almost all the characters are ASCII. For this
purpose, the library is also optimized for mostly-ASCII contents
even in the cases where some conversion is necessary.
*/
// Generated tables
#include "tables_gen.hpp"
using namespace ToUTF8;
// Create an encoder for the given legacy code page.
//
// The output buffer starts at 50 KiB and grows on demand (see resize()).
// The translation table pointer starts out at the WINDOWS-1252 table, the
// same default calculateEncoding() uses, so that an unknown enum value
// cannot leave it uninitialized when assertions are compiled out.
Utf8Encoder::Utf8Encoder(const FromType sourceEncoding):
    mOutput(50*1024),
    translationArray(ToUTF8::windows_1252)
{
    switch (sourceEncoding)
    {
        case ToUTF8::WINDOWS_1252:
        {
            translationArray = ToUTF8::windows_1252;
            break;
        }
        case ToUTF8::WINDOWS_1250:
        {
            translationArray = ToUTF8::windows_1250;
            break;
        }
        case ToUTF8::WINDOWS_1251:
        {
            translationArray = ToUTF8::windows_1251;
            break;
        }
        case ToUTF8::CP437:
        {
            translationArray = ToUTF8::cp437;
            break;
        }
        default:
        {
            // Programming error: every FromType value must be handled above.
            assert(0);
            break;
        }
    }
}
// Convert 'input' from the encoder's legacy code page to UTF-8.
//
// 'size' is only used to assert that input[size] is a terminating zero;
// the conversion itself stops at the first zero byte in 'input'.
std::string Utf8Encoder::getUtf8(const char* input, size_t size)
{
    // The input must be zero terminated at 'size' (zero bytes earlier in
    // the data are tolerated by this check).
    assert(input[size] == 0);

    // TODO: Everything below assumes a single-byte input encoding whose
    // first 128 values (0-127) are identical to ASCII. Re-check both
    // assumptions before adding further input encodings.

    // One pass determines both the output length and whether the input is
    // pure ASCII.
    bool pureAscii;
    const size_t utf8Len = getLength(input, pureAscii);

    // Pure ASCII is already valid UTF-8: return it unchanged.
    if (pureAscii)
        return std::string(input, utf8Len);

    // Grow the buffer if needed; this also writes the terminating zero.
    resize(utf8Len);

    char* const bufferStart = &mOutput[0];
    char* cursor = bufferStart;

    // Translate byte by byte
    for (const char* src = input; *src; ++src)
        copyFromArray(*src, cursor);

    // The number of bytes written must match the precomputed length...
    assert((cursor - bufferStart) == static_cast<int>(utf8Len));

    // ...and the buffer must still be zero terminated right after them.
    assert(mOutput.size() > utf8Len);
    assert(mOutput[utf8Len] == 0);

    return std::string(bufferStart, utf8Len);
}
// Convert UTF-8 'input' back to the encoder's legacy code page.
//
// 'size' is only used to assert that input[size] is a terminating zero;
// the conversion itself stops at the first zero byte in 'input'.
std::string Utf8Encoder::getLegacyEnc(const char *input, size_t size)
{
    // The input must be zero terminated at 'size' (zero bytes earlier in
    // the data are tolerated by this check).
    assert(input[size] == 0);

    // TODO: Everything below assumes a single-byte legacy encoding whose
    // first 128 values (0-127) are identical to ASCII. Re-check both
    // assumptions before adding further encodings.

    // One pass determines both the output length and whether the input is
    // pure ASCII.
    bool pureAscii;
    const size_t legacyLen = getLength2(input, pureAscii);

    // Pure ASCII is identical in the legacy encoding: return it unchanged.
    if (pureAscii)
        return std::string(input, legacyLen);

    // Grow the buffer if needed; this also writes the terminating zero.
    resize(legacyLen);

    char* const bufferStart = &mOutput[0];
    char* cursor = bufferStart;

    // copyFromArray2 advances 'input' itself, by one whole sequence per call.
    while (*input != 0)
        copyFromArray2(input, cursor);

    // The number of bytes written must match the precomputed length...
    assert((cursor - bufferStart) == static_cast<int>(legacyLen));

    // ...and the buffer must still be zero terminated right after them.
    assert(mOutput.size() > legacyLen);
    assert(mOutput[legacyLen] == 0);

    return std::string(bufferStart, legacyLen);
}
// Ensure the output buffer can hold 'size' bytes plus a terminating zero,
// and write that zero at index 'size'.
void Utf8Encoder::resize(size_t size)
{
    const bool needsGrowth = !(mOutput.size() > size);
    if (needsGrowth)
    {
        // Over-allocate so that later, slightly larger strings do not force
        // another reallocation.
        mOutput.resize(size * 3);
    }

    // Zero terminate the string. NOTE(review): for size == 0 this relies on
    // the buffer being non-empty already.
    mOutput[size] = 0;
}
/** Get the total length needed to decode the given string with the
    current translation array. The array holds 6 bytes per input
    character: the first is the UTF-8 length, the next 5 are the data.

    For optimization the function also reports whether the input is
    pure ASCII (all values <= 127) through the 'ascii' parameter, so
    the caller can skip the conversion entirely in that case.
 */
size_t Utf8Encoder::getLength(const char* input, bool &ascii)
{
    const char* cursor = input;

    // Fast path: skip over the leading ASCII run (almost always the whole
    // string). Each of these bytes maps to exactly one output byte.
    while (*cursor != 0 && static_cast<unsigned char>(*cursor) < 128)
        ++cursor;

    size_t total = 0;
    total += (cursor - input);

    // Reaching the terminator here means there was nothing but ASCII.
    ascii = (*cursor == 0);

    // Slow path for the remainder: look up the translated length of every
    // byte in the table.
    for (; *cursor != 0; ++cursor)
    {
        const unsigned char byte = *cursor;
        total += translationArray[byte*6];
    }

    return total;
}
// Translate the single input byte 'ch' through the translation array and
// append the result at 'out', advancing 'out' past the written bytes.
void Utf8Encoder::copyFromArray(unsigned char ch, char* &out)
{
    if (ch >= 128)
    {
        // Table entry layout: [length, byte1 .. byte5]
        const signed char *entry = &translationArray[ch*6];
        const int count = entry[0];

        int written = 0;
        while (written < count)
        {
            *out = entry[1 + written];
            ++out;
            ++written;
        }
        return;
    }

    // ASCII values are copied through unchanged.
    *out = ch;
    ++out;
}
// Compute the number of bytes getLegacyEnc() will produce for the UTF-8
// string 'input', and report through 'ascii' whether the input is pure
// ASCII (all values <= 127).
//
// The count mirrors copyFromArray2(): every call of that function writes
// exactly one output byte and consumes one byte, or a recognised lead byte
// plus its one or two following bytes. Counting whole sequences, rather
// than adding and subtracting per byte, keeps the result in step with what
// is written for malformed input (e.g. two lead bytes in a row) and cannot
// wrap around on a truncated sequence at the end of the string.
size_t Utf8Encoder::getLength2(const char* input, bool &ascii)
{
    ascii = true;
    const char* ptr = input;

    // Do away with the ascii part of the string first (this is almost
    // always the entire string.)
    while (*ptr != 0 && static_cast<unsigned char>(*ptr) < 128)
        ++ptr;

    size_t len = ptr - input;

    // At the terminator already: nothing but ASCII.
    if (*ptr == 0)
        return len;

    // Slow mode for the rest of the string.
    ascii = false;
    while (*ptr != 0)
    {
        const unsigned char lead = *(ptr++);

        // Number of bytes that follow the lead byte in this sequence.
        int following = 0;
        switch (lead)
        {
            case 0xe2: following = 2; break;
            case 0xc2:
            case 0xcb:
            case 0xc4:
            case 0xc6:
            case 0xc3:
            case 0xd0:
            case 0xd1:
            case 0xd2:
            case 0xc5: following = 1; break;
        }

        // Skip the rest of the sequence, but never step over the terminator.
        for (; following > 0 && *ptr != 0; --following)
            ++ptr;

        // One output byte per sequence.
        len += 1;
    }

    return len;
}
// Translate one UTF-8 sequence starting at 'chp' into a single byte of the
// legacy encoding. 'chp' is advanced past the consumed input and 'out'
// past the one byte that is written.
//
// ASCII bytes and bytes that are not one of the recognised lead bytes are
// copied through unchanged. For a recognised 2- or 3-byte sequence the
// upper half of the translation array is searched for a matching UTF-8
// value; if none is found a diagnostic is printed to stdout and the lead
// byte is written as-is.
void Utf8Encoder::copyFromArray2(const char*& chp, char* &out)
{
    unsigned char ch = *(chp++);

    // Optimize for ASCII values
    if (ch < 128)
    {
        *(out++) = ch;
        return;
    }

    // Sequence length implied by the lead byte
    int len = 1;
    switch (ch)
    {
        case 0xe2: len = 3; break;
        case 0xc2:
        case 0xcb:
        case 0xc4:
        case 0xc6:
        case 0xc3:
        case 0xd0:
        case 0xd1:
        case 0xd2:
        case 0xc5: len = 2; break;
    }

    if (len == 1) // Not a recognised lead byte: pass it through
    {
        *(out++) = ch;
        return;
    }

    // Read the following bytes, but never consume the terminating zero: a
    // truncated sequence at the end of the string must not push 'chp'
    // beyond the end of the input. Missing bytes stay 0 and simply fail
    // the lookup below.
    unsigned char ch2 = '\0';
    unsigned char ch3 = '\0';
    if (*chp != 0)
    {
        ch2 = *(chp++);
        if (len == 3 && *chp != 0)
            ch3 = *(chp++);
    }

    // Reverse lookup: find the legacy byte whose UTF-8 form matches.
    for (int i = 128; i < 256; i++)
    {
        unsigned char b1 = translationArray[i*6 + 1], b2 = translationArray[i*6 + 2], b3 = translationArray[i*6 + 3];
        if (b1 == ch && b2 == ch2 && (len != 3 || b3 == ch3))
        {
            *(out++) = (char)i;
            return;
        }
    }

    // std::hex is sticky, so restore the stream's flags after the message.
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    std::cout << "Could not find glyph " << std::hex << (int)ch << " " << (int)ch2 << " " << (int)ch3 << std::endl;
    std::cout.flags(savedFlags);

    *(out++) = ch; // Could not find glyph, just put whatever
}
// Map a configuration name ("win1250", "win1251") to its code page.
// Any other name selects WINDOWS-1252.
ToUTF8::FromType ToUTF8::calculateEncoding(const std::string& encodingName)
{
    if (encodingName == "win1250")
        return ToUTF8::WINDOWS_1250;

    if (encodingName == "win1251")
        return ToUTF8::WINDOWS_1251;

    return ToUTF8::WINDOWS_1252;
}
// Return the human-readable message describing which font encoding the
// configuration name ("win1250", "win1251", anything else) selects.
std::string ToUTF8::encodingUsingMessage(const std::string& encodingName)
{
    if (encodingName == "win1250")
        return "Using Central and Eastern European font encoding.";

    if (encodingName == "win1251")
        return "Using Cyrillic font encoding.";

    return "Using default (English) font encoding.";
}

View file

@ -0,0 +1,55 @@
#ifndef COMPONENTS_TOUTF8_H
#define COMPONENTS_TOUTF8_H
#include <string>
#include <cstring>
#include <vector>
namespace ToUTF8
{
    // All currently supported code pages
    enum FromType
    {
        WINDOWS_1250, // Central and Eastern European languages
        WINDOWS_1251, // Cyrillic languages
        WINDOWS_1252, // Used by English version of Morrowind (and
                      // probably others)
        CP437         // Used for fonts (*.fnt) if data files encoding is 1252. Otherwise, uses the same encoding as the data files.
    };

    // Map an encoding name ("win1250", "win1251", ...) to a code page.
    FromType calculateEncoding(const std::string& encodingName);

    // Human-readable message describing the encoding selected by the name.
    std::string encodingUsingMessage(const std::string& encodingName);

    // Converts text between one legacy code page and UTF-8 using
    // pregenerated lookup tables.
    class Utf8Encoder
    {
    public:
        Utf8Encoder(FromType sourceEncoding);

        // Convert to UTF-8 from the code page given at construction.
        // input[size] must be the terminating zero.
        std::string getUtf8(const char *input, size_t size);

        std::string getUtf8(const std::string &str)
        {
            return getUtf8(str.c_str(), str.size());
        }

        // Convert from UTF-8 back to the code page given at construction.
        // input[size] must be the terminating zero.
        std::string getLegacyEnc(const char *input, size_t size);

        std::string getLegacyEnc(const std::string &str)
        {
            return getLegacyEnc(str.c_str(), str.size());
        }

    private:
        // Make room in mOutput for 'size' bytes plus a terminating zero.
        void resize(size_t size);

        // Legacy -> UTF-8 helpers
        size_t getLength(const char* input, bool &ascii);
        void copyFromArray(unsigned char chp, char* &out);

        // UTF-8 -> legacy helpers
        size_t getLength2(const char* input, bool &ascii);
        void copyFromArray2(const char*& chp, char* &out);

        // Conversion output buffer
        std::vector<char> mOutput;

        // Table for the selected code page: 6 bytes per input value
        // (UTF-8 length followed by 5 data bytes)
        signed char* translationArray;
    };
}
#endif