#include "Scanner.h" #include #include // string handling, wide character wchar_t* coco_string_create(const wchar_t* value) { return coco_string_create(value, 0); } wchar_t* coco_string_create(const wchar_t *value, int startIndex) { int valueLen = 0; int len = 0; if (value) { valueLen = wcslen(value); len = valueLen - startIndex; } return coco_string_create(value, startIndex, len); } wchar_t* coco_string_create(const wchar_t *value, int startIndex, int length) { int len = 0; wchar_t* data; if (value) { len = length; } data = new wchar_t[len + 1]; wcsncpy(data, &(value[startIndex]), len); data[len] = 0; return data; } wchar_t* coco_string_create_upper(const wchar_t* data) { if (!data) { return NULL; } int dataLen = 0; if (data) { dataLen = wcslen(data); } wchar_t *newData = new wchar_t[dataLen + 1]; for (int i = 0; i <= dataLen; i++) { if ((L'a' <= data[i]) && (data[i] <= L'z')) { newData[i] = data[i] + (L'A' - L'a'); } else { newData[i] = data[i]; } } newData[dataLen] = L'\0'; return newData; } wchar_t* coco_string_create_lower(const wchar_t* data) { if (!data) { return NULL; } int dataLen = wcslen(data); return coco_string_create_lower(data, 0, dataLen); } wchar_t* coco_string_create_lower(const wchar_t* data, int startIndex, int dataLen) { if (!data) { return NULL; } wchar_t* newData = new wchar_t[dataLen + 1]; for (int i = 0; i <= dataLen; i++) { wchar_t ch = data[startIndex + i]; if ((L'A' <= ch) && (ch <= L'Z')) { newData[i] = ch - (L'A' - L'a'); } else { newData[i] = ch; } } newData[dataLen] = L'\0'; return newData; } wchar_t* coco_string_create_append(const wchar_t* data1, const wchar_t* data2) { wchar_t* data; int data1Len = 0; int data2Len = 0; if (data1) { data1Len = wcslen(data1); } if (data2) {data2Len = wcslen(data2); } data = new wchar_t[data1Len + data2Len + 1]; if (data1) { wcscpy(data, data1); } if (data2) { wcscpy(data + data1Len, data2); } data[data1Len + data2Len] = 0; return data; } wchar_t* coco_string_create_append(const wchar_t *target, const wchar_t appendix) { int targetLen = coco_string_length(target); wchar_t* data = new wchar_t[targetLen + 2]; wcsncpy(data, target, targetLen); data[targetLen] = appendix; data[targetLen + 1] = 0; return data; } void coco_string_delete(wchar_t* &data) { delete [] data; data = NULL; } int coco_string_length(const wchar_t* data) { if (data) { return wcslen(data); } return 0; } bool coco_string_endswith(const wchar_t* data, const wchar_t *end) { int dataLen = wcslen(data); int endLen = wcslen(end); return (endLen <= dataLen) && (wcscmp(data + dataLen - endLen, end) == 0); } int coco_string_indexof(const wchar_t* data, const wchar_t value) { const wchar_t* chr = wcschr(data, value); if (chr) { return (chr-data); } return -1; } int coco_string_lastindexof(const wchar_t* data, const wchar_t value) { const wchar_t* chr = wcsrchr(data, value); if (chr) { return (chr-data); } return -1; } void coco_string_merge(wchar_t* &target, const wchar_t* appendix) { if (!appendix) { return; } wchar_t* data = coco_string_create_append(target, appendix); delete [] target; target = data; } bool coco_string_equal(const wchar_t* data1, const wchar_t* data2) { return wcscmp( data1, data2 ) == 0; } int coco_string_compareto(const wchar_t* data1, const wchar_t* data2) { return wcscmp(data1, data2); } int coco_string_hash(const wchar_t *data) { int h = 0; if (!data) { return 0; } while (*data != 0) { h = (h * 7) ^ *data; ++data; } if (h < 0) { h = -h; } return h; } // string handling, ascii character wchar_t* coco_string_create(const char* value) { int len = 0; if (value) { len = strlen(value); } wchar_t* data = new wchar_t[len + 1]; for (int i = 0; i < len; ++i) { data[i] = (wchar_t) value[i]; } data[len] = 0; return data; } char* coco_string_create_char(const wchar_t *value) { int len = coco_string_length(value); char *res = new char[len + 1]; for (int i = 0; i < len; ++i) { res[i] = (char) value[i]; } res[len] = 0; return res; } void coco_string_delete(char* &data) { delete [] data; data = NULL; } Token::Token() { kind = 0; pos = 0; col = 0; line = 0; val = NULL; next = NULL; } Token::~Token() { coco_string_delete(val); } Buffer::Buffer(FILE* s, bool isUserStream) { // ensure binary read on windows #if _MSC_VER >= 1300 _setmode(_fileno(s), _O_BINARY); #endif stream = s; this->isUserStream = isUserStream; if (CanSeek()) { fseek(s, 0, SEEK_END); fileLen = ftell(s); fseek(s, 0, SEEK_SET); bufLen = (fileLen < COCO_MAX_BUFFER_LENGTH) ? fileLen : COCO_MAX_BUFFER_LENGTH; bufStart = INT_MAX; // nothing in the buffer so far } else { fileLen = bufLen = bufStart = 0; } bufCapacity = (bufLen>0) ? bufLen : COCO_MIN_BUFFER_LENGTH; buf = new unsigned char[bufCapacity]; if (fileLen > 0) SetPos(0); // setup buffer to position 0 (start) else bufPos = 0; // index 0 is already after the file, thus Pos = 0 is invalid if (bufLen == fileLen && CanSeek()) Close(); } Buffer::Buffer(Buffer *b) { buf = b->buf; bufCapacity = b->bufCapacity; b->buf = NULL; bufStart = b->bufStart; bufLen = b->bufLen; fileLen = b->fileLen; bufPos = b->bufPos; stream = b->stream; b->stream = NULL; isUserStream = b->isUserStream; } Buffer::Buffer(const unsigned char* buf, int len) { this->buf = new unsigned char[len]; memcpy(this->buf, buf, len*sizeof(unsigned char)); bufStart = 0; bufCapacity = bufLen = len; fileLen = len; bufPos = 0; stream = NULL; isUserStream = false; } Buffer::~Buffer() { Close(); if (buf != NULL) { delete [] buf; buf = NULL; } } void Buffer::Close() { if (!isUserStream && stream != NULL) { fclose(stream); stream = NULL; } } int Buffer::Read() { if (bufPos < bufLen) { return buf[bufPos++]; } else if (GetPos() < fileLen) { SetPos(GetPos()); // shift buffer start to Pos return buf[bufPos++]; } else if ((stream != NULL) && !CanSeek() && (ReadNextStreamChunk() > 0)) { return buf[bufPos++]; } else { return EoF; } } int Buffer::Peek() { int curPos = GetPos(); int ch = Read(); SetPos(curPos); return ch; } // beg .. begin, zero-based, inclusive, in byte // end .. end, zero-based, exclusive, in byte wchar_t* Buffer::GetString(int beg, int end) { int len = 0; wchar_t *buf = new wchar_t[end - beg]; int oldPos = GetPos(); SetPos(beg); while (GetPos() < end) buf[len++] = (wchar_t) Read(); SetPos(oldPos); wchar_t *res = coco_string_create(buf, 0, len); coco_string_delete(buf); return res; } int Buffer::GetPos() { return bufPos + bufStart; } void Buffer::SetPos(int value) { if ((value >= fileLen) && (stream != NULL) && !CanSeek()) { // Wanted position is after buffer and the stream // is not seek-able e.g. network or console, // thus we have to read the stream manually till // the wanted position is in sight. while ((value >= fileLen) && (ReadNextStreamChunk() > 0)); } if ((value < 0) || (value > fileLen)) { wprintf(L"--- buffer out of bounds access, position: %d\n", value); exit(1); } if ((value >= bufStart) && (value < (bufStart + bufLen))) { // already in buffer bufPos = value - bufStart; } else if (stream != NULL) { // must be swapped in fseek(stream, value, SEEK_SET); bufLen = fread(buf, sizeof(unsigned char), bufCapacity, stream); bufStart = value; bufPos = 0; } else { bufPos = fileLen - bufStart; // make Pos return fileLen } } // Read the next chunk of bytes from the stream, increases the buffer // if needed and updates the fields fileLen and bufLen. // Returns the number of bytes read. int Buffer::ReadNextStreamChunk() { int free = bufCapacity - bufLen; if (free == 0) { // in the case of a growing input stream // we can neither seek in the stream, nor can we // foresee the maximum length, thus we must adapt // the buffer size on demand. bufCapacity = bufLen * 2; unsigned char *newBuf = new unsigned char[bufCapacity]; memcpy(newBuf, buf, bufLen*sizeof(unsigned char)); delete [] buf; buf = newBuf; free = bufLen; } int read = fread(buf + bufLen, sizeof(unsigned char), free, stream); if (read > 0) { fileLen = bufLen = (bufLen + read); return read; } // end of stream reached return 0; } bool Buffer::CanSeek() { return (stream != NULL) && (ftell(stream) != -1); } int UTF8Buffer::Read() { int ch; do { ch = Buffer::Read(); // until we find a utf8 start (0xxxxxxx or 11xxxxxx) } while ((ch >= 128) && ((ch & 0xC0) != 0xC0) && (ch != EoF)); if (ch < 128 || ch == EoF) { // nothing to do, first 127 chars are the same in ascii and utf8 // 0xxxxxxx or end of file character } else if ((ch & 0xF0) == 0xF0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx int c1 = ch & 0x07; ch = Buffer::Read(); int c2 = ch & 0x3F; ch = Buffer::Read(); int c3 = ch & 0x3F; ch = Buffer::Read(); int c4 = ch & 0x3F; ch = (((((c1 << 6) | c2) << 6) | c3) << 6) | c4; } else if ((ch & 0xE0) == 0xE0) { // 1110xxxx 10xxxxxx 10xxxxxx int c1 = ch & 0x0F; ch = Buffer::Read(); int c2 = ch & 0x3F; ch = Buffer::Read(); int c3 = ch & 0x3F; ch = (((c1 << 6) | c2) << 6) | c3; } else if ((ch & 0xC0) == 0xC0) { // 110xxxxx 10xxxxxx int c1 = ch & 0x1F; ch = Buffer::Read(); int c2 = ch & 0x3F; ch = (c1 << 6) | c2; } return ch; } Scanner::Scanner(const unsigned char* buf, int len) { buffer = new Buffer(buf, len); Init(); } Scanner::Scanner(const wchar_t* fileName) { FILE* stream; char *chFileName = coco_string_create_char(fileName); if ((stream = fopen(chFileName, "rb")) == NULL) { wprintf(L"--- Cannot open file %ls\n", fileName); exit(1); } coco_string_delete(chFileName); buffer = new Buffer(stream, false); Init(); } Scanner::Scanner(FILE* s) { buffer = new Buffer(s, true); Init(); } Scanner::~Scanner() { char* cur = (char*) firstHeap; while(cur != NULL) { cur = *(char**) (cur + COCO_HEAP_BLOCK_SIZE); free(firstHeap); firstHeap = cur; } delete [] tval; delete buffer; } void Scanner::Init() { EOL = '\n'; eofSym = 0; maxT = 21; noSym = 21; int i; for (i = 46; i <= 46; ++i) start.set(i, 3); for (i = 65; i <= 90; ++i) start.set(i, 3); for (i = 95; i <= 95; ++i) start.set(i, 3); for (i = 97; i <= 122; ++i) start.set(i, 3); for (i = 48; i <= 57; ++i) start.set(i, 9); start.set(40, 1); start.set(41, 2); start.set(34, 4); start.set(45, 10); start.set(61, 19); start.set(33, 20); start.set(60, 21); start.set(62, 22); start.set(35, 17); start.set(Buffer::EoF, -1); keywords.set(L"between", 18); keywords.set(L"and", 19); keywords.set(L"or", 20); tvalLength = 128; tval = new wchar_t[tvalLength]; // text of current token // COCO_HEAP_BLOCK_SIZE byte heap + pointer to next heap block heap = malloc(COCO_HEAP_BLOCK_SIZE + sizeof(void*)); firstHeap = heap; heapEnd = (void**) (((char*) heap) + COCO_HEAP_BLOCK_SIZE); *heapEnd = 0; heapTop = heap; if (sizeof(Token) > COCO_HEAP_BLOCK_SIZE) { wprintf(L"--- Too small COCO_HEAP_BLOCK_SIZE\n"); exit(1); } pos = -1; line = 1; col = 0; charPos = -1; oldEols = 0; NextCh(); if (ch == 0xEF) { // check optional byte order mark for UTF-8 NextCh(); int ch1 = ch; NextCh(); int ch2 = ch; if (ch1 != 0xBB || ch2 != 0xBF) { wprintf(L"Illegal byte order mark at start of file"); exit(1); } Buffer *oldBuf = buffer; buffer = new UTF8Buffer(buffer); col = 0; charPos = -1; delete oldBuf; oldBuf = NULL; NextCh(); } pt = tokens = CreateToken(); // first token is a dummy } void Scanner::NextCh() { if (oldEols > 0) { ch = EOL; oldEols--; } else { pos = buffer->GetPos(); // buffer reads unicode chars, if UTF8 has been detected ch = buffer->Read(); col++; charPos++; // replace isolated '\r' by '\n' in order to make // eol handling uniform across Windows, Unix and Mac if (ch == L'\r' && buffer->Peek() != L'\n') ch = EOL; if (ch == EOL) { line++; col = 0; } } } void Scanner::AddCh() { if (tlen >= tvalLength) { tvalLength *= 2; wchar_t *newBuf = new wchar_t[tvalLength]; memcpy(newBuf, tval, tlen*sizeof(wchar_t)); delete [] tval; tval = newBuf; } if (ch != Buffer::EoF) { tval[tlen++] = ch; NextCh(); } } void Scanner::CreateHeapBlock() { void* newHeap; char* cur = (char*) firstHeap; while(((char*) tokens < cur) || ((char*) tokens > (cur + COCO_HEAP_BLOCK_SIZE))) { cur = *((char**) (cur + COCO_HEAP_BLOCK_SIZE)); free(firstHeap); firstHeap = cur; } // COCO_HEAP_BLOCK_SIZE byte heap + pointer to next heap block newHeap = malloc(COCO_HEAP_BLOCK_SIZE + sizeof(void*)); *heapEnd = newHeap; heapEnd = (void**) (((char*) newHeap) + COCO_HEAP_BLOCK_SIZE); *heapEnd = 0; heap = newHeap; heapTop = heap; } Token* Scanner::CreateToken() { Token *t; if (((char*) heapTop + (int) sizeof(Token)) >= (char*) heapEnd) { CreateHeapBlock(); } t = (Token*) heapTop; heapTop = (void*) ((char*) heapTop + sizeof(Token)); t->val = NULL; t->next = NULL; return t; } void Scanner::AppendVal(Token *t) { int reqMem = (tlen + 1) * sizeof(wchar_t); if (((char*) heapTop + reqMem) >= (char*) heapEnd) { if (reqMem > COCO_HEAP_BLOCK_SIZE) { wprintf(L"--- Too long token value\n"); exit(1); } CreateHeapBlock(); } t->val = (wchar_t*) heapTop; heapTop = (void*) ((char*) heapTop + reqMem); wcsncpy(t->val, tval, tlen); t->val[tlen] = L'\0'; } Token* Scanner::NextToken() { while (ch == ' ' || false ) NextCh(); int recKind = noSym; int recEnd = pos; t = CreateToken(); t->pos = pos; t->col = col; t->line = line; t->charPos = charPos; int state = start.state(ch); tlen = 0; AddCh(); switch (state) { case -1: { t->kind = eofSym; break; } // NextCh already done case 0: { case_0: if (recKind != noSym) { tlen = recEnd - t->pos; SetScannerBehindT(); } t->kind = recKind; break; } // NextCh already done case 1: {t->kind = 1; break;} case 2: {t->kind = 2; break;} case 3: case_3: recEnd = pos; recKind = 3; if ((ch >= L'-' && ch <= L'.') || (ch >= L'A' && ch <= L'Z') || ch == L'_' || (ch >= L'a' && ch <= L'z')) {AddCh(); goto case_3;} else {t->kind = 3; wchar_t *literal = coco_string_create(tval, 0, tlen); t->kind = keywords.get(literal, t->kind); coco_string_delete(literal); break;} case 4: case_4: if (ch <= L'!' || (ch >= L'#' && ch <= 65535)) {AddCh(); goto case_4;} else if (ch == L'"') {AddCh(); goto case_5;} else {goto case_0;} case 5: case_5: {t->kind = 4; break;} case 6: case_6: recEnd = pos; recKind = 5; if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_6;} else {t->kind = 5; break;} case 7: case_7: if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_8;} else {goto case_0;} case 8: case_8: recEnd = pos; recKind = 6; if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_8;} else {t->kind = 6; break;} case 9: case_9: recEnd = pos; recKind = 5; if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_9;} else if (ch == L':') {AddCh(); goto case_7;} else {t->kind = 5; break;} case 10: recEnd = pos; recKind = 3; if ((ch >= L'-' && ch <= L'.') || (ch >= L'A' && ch <= L'Z') || ch == L'_' || (ch >= L'a' && ch <= L'z')) {AddCh(); goto case_3;} else if ((ch >= L'0' && ch <= L'9')) {AddCh(); goto case_6;} else {t->kind = 3; wchar_t *literal = coco_string_create(tval, 0, tlen); t->kind = keywords.get(literal, t->kind); coco_string_delete(literal); break;} case 11: case_11: {t->kind = 7; break;} case 12: case_12: {t->kind = 9; break;} case 13: case_13: {t->kind = 10; break;} case 14: case_14: {t->kind = 11; break;} case 15: case_15: {t->kind = 14; break;} case 16: case_16: {t->kind = 15; break;} case 17: {t->kind = 16; break;} case 18: case_18: {t->kind = 17; break;} case 19: recEnd = pos; recKind = 8; if (ch == L'=') {AddCh(); goto case_11;} else if (ch == L'~') {AddCh(); goto case_13;} else {t->kind = 8; break;} case 20: if (ch == L'=') {AddCh(); goto case_12;} else if (ch == L'~') {AddCh(); goto case_14;} else if (ch == L'#') {AddCh(); goto case_18;} else {goto case_0;} case 21: recEnd = pos; recKind = 12; if (ch == L'=') {AddCh(); goto case_15;} else {t->kind = 12; break;} case 22: recEnd = pos; recKind = 13; if (ch == L'=') {AddCh(); goto case_16;} else {t->kind = 13; break;} } AppendVal(t); return t; } void Scanner::SetScannerBehindT() { buffer->SetPos(t->pos); NextCh(); line = t->line; col = t->col; charPos = t->charPos; for (int i = 0; i < tlen; i++) NextCh(); } // get the next token (possibly a token already seen during peeking) Token* Scanner::Scan() { if (tokens->next == NULL) { return pt = tokens = NextToken(); } else { pt = tokens = tokens->next; return tokens; } } // peek for the next token, ignore pragmas Token* Scanner::Peek() { do { if (pt->next == NULL) { pt->next = NextToken(); } pt = pt->next; } while (pt->kind > maxT); // skip pragmas return pt; } // make sure that peeking starts at the current scan position void Scanner::ResetPeek() { pt = tokens; }