12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606 |
- /*
- www.sourceforge.net/projects/tinyxml
- Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
- This software is provided 'as-is', without any express or implied
- warranty. In no event will the authors be held liable for any
- damages arising from the use of this software.
- Permission is granted to anyone to use this software for any
- purpose, including commercial applications, and to alter it and
- redistribute it freely, subject to the following restrictions:
- 1. The origin of this software must not be misrepresented; you must
- not claim that you wrote the original software. If you use this
- software in a product, an acknowledgment in the product documentation
- would be appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and
- must not be misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source
- distribution.
- */
- #include <ctype.h>
- #include <stddef.h>
- #include "tinyxml.h"
- //#define DEBUG_PARSER
- #if defined( DEBUG_PARSER )
- # if defined( DEBUG ) && defined( _MSC_VER )
- # include <windows.h>
- # define TIXML_LOG OutputDebugString
- # else
- # define TIXML_LOG printf
- # endif
- #endif
// Note that "PutString" hardcodes the same list. This
// is less flexible than it appears. Changing the entries
// or their order will break PutString.
- TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
- {
- { "&", 5, '&' },
- { "<", 4, '<' },
- { ">", 4, '>' },
- { """, 6, '\"' },
- { "'", 6, '\'' }
- };
- // Bunch of unicode info at:
- // http://www.unicode.org/faq/utf_bom.html
- // Including the basic of this table, which determines the #bytes in the
- // sequence from the lead byte. 1 placed for invalid sequences --
- // although the result will be junk, pass it through as much as possible.
- // Beware of the non-characters in UTF-8:
- // ef bb bf (Microsoft "lead bytes")
- // ef bf be
- // ef bf bf
// The three bytes ef bb bf of the Microsoft UTF-8 "lead bytes" sequence,
// in the order in which they appear in the input.
const unsigned char TIXML_UTF_LEAD_0 = 0xEFU;
const unsigned char TIXML_UTF_LEAD_1 = 0xBBU;
const unsigned char TIXML_UTF_LEAD_2 = 0xBFU;
- const int TiXmlBase::utf8ByteTable[256] =
- {
- // 0 1 2 3 4 5 6 7 8 9 a b c d e f
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
- 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
- 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
- };
- void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
- {
- const unsigned long BYTE_MASK = 0xBF;
- const unsigned long BYTE_MARK = 0x80;
- const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
- if (input < 0x80)
- *length = 1;
- else if ( input < 0x800 )
- *length = 2;
- else if ( input < 0x10000 )
- *length = 3;
- else if ( input < 0x200000 )
- *length = 4;
- else
- { *length = 0; return; } // This code won't covert this correctly anyway.
- output += *length;
- // Scary scary fall throughs.
- switch (*length)
- {
- case 4:
- --output;
- *output = (char)((input | BYTE_MARK) & BYTE_MASK);
- input >>= 6;
- case 3:
- --output;
- *output = (char)((input | BYTE_MARK) & BYTE_MASK);
- input >>= 6;
- case 2:
- --output;
- *output = (char)((input | BYTE_MARK) & BYTE_MASK);
- input >>= 6;
- case 1:
- --output;
- *output = (char)(input | FIRST_BYTE_MARK[*length]);
- }
- }
- /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
- {
- // This will only work for low-ascii, everything else is assumed to be a valid
- // letter. I'm not sure this is the best approach, but it is quite tricky trying
- // to figure out alhabetical vs. not across encoding. So take a very
- // conservative approach.
- // if ( encoding == TIXML_ENCODING_UTF8 )
- // {
- if ( anyByte < 127 )
- return isalpha( anyByte );
- else
- return 1; // What else to do? The unicode set is huge...get the english ones right.
- // }
- // else
- // {
- // return isalpha( anyByte );
- // }
- }
- /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
- {
- // This will only work for low-ascii, everything else is assumed to be a valid
- // letter. I'm not sure this is the best approach, but it is quite tricky trying
- // to figure out alhabetical vs. not across encoding. So take a very
- // conservative approach.
- // if ( encoding == TIXML_ENCODING_UTF8 )
- // {
- if ( anyByte < 127 )
- return isalnum( anyByte );
- else
- return 1; // What else to do? The unicode set is huge...get the english ones right.
- // }
- // else
- // {
- // return isalnum( anyByte );
- // }
- }
- class TiXmlParsingData
- {
- friend class TiXmlDocument;
- public:
- void Stamp( const char* now, TiXmlEncoding encoding );
- const TiXmlCursor& Cursor() { return cursor; }
- private:
- // Only used by the document!
- TiXmlParsingData( const char* start, int _tabsize, int row, int col )
- {
- assert( start );
- stamp = start;
- tabsize = _tabsize;
- cursor.row = row;
- cursor.col = col;
- }
- TiXmlCursor cursor;
- const char* stamp;
- int tabsize;
- };
- void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
- {
- assert( now );
- // Do nothing if the tabsize is 0.
- if ( tabsize < 1 )
- {
- return;
- }
- // Get the current row, column.
- int row = cursor.row;
- int col = cursor.col;
- const char* p = stamp;
- assert( p );
- while ( p < now )
- {
- // Treat p as unsigned, so we have a happy compiler.
- const unsigned char* pU = (const unsigned char*)p;
- // Code contributed by Fletcher Dunn: (modified by lee)
- switch (*pU) {
- case 0:
- // We *should* never get here, but in case we do, don't
- // advance past the terminating null character, ever
- return;
- case '\r':
- // bump down to the next line
- ++row;
- col = 0;
- // Eat the character
- ++p;
- // Check for \r\n sequence, and treat this as a single character
- if (*p == '\n') {
- ++p;
- }
- break;
- case '\n':
- // bump down to the next line
- ++row;
- col = 0;
- // Eat the character
- ++p;
- // Check for \n\r sequence, and treat this as a single
- // character. (Yes, this bizarre thing does occur still
- // on some arcane platforms...)
- if (*p == '\r') {
- ++p;
- }
- break;
- case '\t':
- // Eat the character
- ++p;
- // Skip to next tab stop
- col = (col / tabsize + 1) * tabsize;
- break;
- case TIXML_UTF_LEAD_0:
- if ( encoding == TIXML_ENCODING_UTF8 )
- {
- if ( *(p+1) && *(p+2) )
- {
- // In these cases, don't advance the column. These are
- // 0-width spaces.
- if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
- p += 3;
- else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
- p += 3;
- else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
- p += 3;
- else
- { p +=3; ++col; } // A normal character.
- }
- }
- else
- {
- ++p;
- ++col;
- }
- break;
- default:
- if ( encoding == TIXML_ENCODING_UTF8 )
- {
- // Eat the 1 to 4 byte utf8 character.
- int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
- if ( step == 0 )
- step = 1; // Error case from bad encoding, but handle gracefully.
- p += step;
- // Just advance one column, of course.
- ++col;
- }
- else
- {
- ++p;
- ++col;
- }
- break;
- }
- }
- cursor.row = row;
- cursor.col = col;
- assert( cursor.row >= -1 );
- assert( cursor.col >= -1 );
- stamp = p;
- assert( stamp );
- }
- const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
- {
- if ( !p || !*p )
- {
- return 0;
- }
- if ( encoding == TIXML_ENCODING_UTF8 )
- {
- while ( *p )
- {
- const unsigned char* pU = (const unsigned char*)p;
-
- // Skip the stupid Microsoft UTF-8 Byte order marks
- if ( *(pU+0)==TIXML_UTF_LEAD_0
- && *(pU+1)==TIXML_UTF_LEAD_1
- && *(pU+2)==TIXML_UTF_LEAD_2 )
- {
- p += 3;
- continue;
- }
- else if(*(pU+0)==TIXML_UTF_LEAD_0
- && *(pU+1)==0xbfU
- && *(pU+2)==0xbeU )
- {
- p += 3;
- continue;
- }
- else if(*(pU+0)==TIXML_UTF_LEAD_0
- && *(pU+1)==0xbfU
- && *(pU+2)==0xbfU )
- {
- p += 3;
- continue;
- }
- if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' ) // Still using old rules for white space.
- ++p;
- else
- break;
- }
- }
- else
- {
- while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )
- ++p;
- }
- return p;
- }
- #ifdef TIXML_USE_STL
- /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
- {
- for( ;; )
- {
- if ( !in->good() ) return false;
- int c = in->peek();
- // At this scope, we can't get to a document. So fail silently.
- if ( !IsWhiteSpace( c ) || c <= 0 )
- return true;
- *tag += (char) in->get();
- }
- }
- /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
- {
- //assert( character > 0 && character < 128 ); // else it won't work in utf-8
- while ( in->good() )
- {
- int c = in->peek();
- if ( c == character )
- return true;
- if ( c <= 0 ) // Silent failure: can't get document at this scope
- return false;
- in->get();
- *tag += (char) c;
- }
- return false;
- }
- #endif
- // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
- // "assign" optimization removes over 10% of the execution time.
- //
- const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
- {
- // Oddly, not supported on some comilers,
- //name->clear();
- // So use this:
- *name = "";
- assert( p );
- // Names start with letters or underscores.
- // Of course, in unicode, tinyxml has no idea what a letter *is*. The
- // algorithm is generous.
- //
- // After that, they can be letters, underscores, numbers,
- // hyphens, or colons. (Colons are valid ony for namespaces,
- // but tinyxml can't tell namespaces from names.)
- if ( p && *p
- && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
- {
- const char* start = p;
- while( p && *p
- && ( IsAlphaNum( (unsigned char ) *p, encoding )
- || *p == '_'
- || *p == '-'
- || *p == '.'
- || *p == ':' ) )
- {
- //(*name) += *p; // expensive
- ++p;
- }
- if ( p-start > 0 ) {
- name->assign( start, p-start );
- }
- return p;
- }
- return 0;
- }
- const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
- {
- // Presume an entity, and pull it out.
- TIXML_STRING ent;
- int i;
- *length = 0;
- if ( *(p+1) && *(p+1) == '#' && *(p+2) )
- {
- unsigned long ucs = 0;
- ptrdiff_t delta = 0;
- unsigned mult = 1;
- if ( *(p+2) == 'x' )
- {
- // Hexadecimal.
- if ( !*(p+3) ) return 0;
- const char* q = p+3;
- q = strchr( q, ';' );
- if ( !q || !*q ) return 0;
- delta = q-p;
- --q;
- while ( *q != 'x' )
- {
- if ( *q >= '0' && *q <= '9' )
- ucs += mult * (*q - '0');
- else if ( *q >= 'a' && *q <= 'f' )
- ucs += mult * (*q - 'a' + 10);
- else if ( *q >= 'A' && *q <= 'F' )
- ucs += mult * (*q - 'A' + 10 );
- else
- return 0;
- mult *= 16;
- --q;
- }
- }
- else
- {
- // Decimal.
- if ( !*(p+2) ) return 0;
- const char* q = p+2;
- q = strchr( q, ';' );
- if ( !q || !*q ) return 0;
- delta = q-p;
- --q;
- while ( *q != '#' )
- {
- if ( *q >= '0' && *q <= '9' )
- ucs += mult * (*q - '0');
- else
- return 0;
- mult *= 10;
- --q;
- }
- }
- if ( encoding == TIXML_ENCODING_UTF8 )
- {
- // convert the UCS to UTF-8
- ConvertUTF32ToUTF8( ucs, value, length );
- }
- else
- {
- *value = (char)ucs;
- *length = 1;
- }
- return p + delta + 1;
- }
- // Now try to match it.
- for( i=0; i<NUM_ENTITY; ++i )
- {
- if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
- {
- assert( strlen( entity[i].str ) == entity[i].strLength );
- *value = entity[i].chr;
- *length = 1;
- return ( p + entity[i].strLength );
- }
- }
- // So it wasn't an entity, its unrecognized, or something like that.
- *value = *p; // Don't put back the last one, since we return it!
- //*length = 1; // Leave unrecognized entities - this doesn't really work.
- // Just writes strange XML.
- return p+1;
- }
- bool TiXmlBase::StringEqual( const char* p,
- const char* tag,
- bool ignoreCase,
- TiXmlEncoding encoding )
- {
- assert( p );
- assert( tag );
- if ( !p || !*p )
- {
- assert( 0 );
- return false;
- }
- const char* q = p;
- if ( ignoreCase )
- {
- while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
- {
- ++q;
- ++tag;
- }
- if ( *tag == 0 )
- return true;
- }
- else
- {
- while ( *q && *tag && *q == *tag )
- {
- ++q;
- ++tag;
- }
- if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
- return true;
- }
- return false;
- }
- const char* TiXmlBase::ReadText( const char* p,
- TIXML_STRING * text,
- bool trimWhiteSpace,
- const char* endTag,
- bool caseInsensitive,
- TiXmlEncoding encoding )
- {
- *text = "";
- if ( !trimWhiteSpace // certain tags always keep whitespace
- || !condenseWhiteSpace ) // if true, whitespace is always kept
- {
- // Keep all the white space.
- while ( p && *p
- && !StringEqual( p, endTag, caseInsensitive, encoding )
- )
- {
- int len;
- char cArr[4] = { 0, 0, 0, 0 };
- p = GetChar( p, cArr, &len, encoding );
- text->append( cArr, len );
- }
- }
- else
- {
- bool whitespace = false;
- // Remove leading white space:
- p = SkipWhiteSpace( p, encoding );
- while ( p && *p
- && !StringEqual( p, endTag, caseInsensitive, encoding ) )
- {
- if ( *p == '\r' || *p == '\n' )
- {
- whitespace = true;
- ++p;
- }
- else if ( IsWhiteSpace( *p ) )
- {
- whitespace = true;
- ++p;
- }
- else
- {
- // If we've found whitespace, add it before the
- // new character. Any whitespace just becomes a space.
- if ( whitespace )
- {
- (*text) += ' ';
- whitespace = false;
- }
- int len;
- char cArr[4] = { 0, 0, 0, 0 };
- p = GetChar( p, cArr, &len, encoding );
- if ( len == 1 )
- (*text) += cArr[0]; // more efficient
- else
- text->append( cArr, len );
- }
- }
- }
- if ( p )
- p += strlen( endTag );
- return p;
- }
- #ifdef TIXML_USE_STL
- void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
- // The basic issue with a document is that we don't know what we're
- // streaming. Read something presumed to be a tag (and hope), then
- // identify it, and call the appropriate stream method on the tag.
- //
- // This "pre-streaming" will never read the closing ">" so the
- // sub-tag can orient itself.
- if ( !StreamTo( in, '<', tag ) )
- {
- SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- while ( in->good() )
- {
- int tagIndex = (int) tag->length();
- while ( in->good() && in->peek() != '>' )
- {
- int c = in->get();
- if ( c <= 0 )
- {
- SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- break;
- }
- (*tag) += (char) c;
- }
- if ( in->good() )
- {
- // We now have something we presume to be a node of
- // some sort. Identify it, and call the node to
- // continue streaming.
- TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
- if ( node )
- {
- node->StreamIn( in, tag );
- bool isElement = node->ToElement() != 0;
- delete node;
- node = 0;
- // If this is the root element, we're done. Parsing will be
- // done by the >> operator.
- if ( isElement )
- {
- return;
- }
- }
- else
- {
- SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- }
- }
- // We should have returned sooner.
- SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
- }
- #endif
- const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
- {
- ClearError();
- // Parse away, at the document level. Since a document
- // contains nothing but other tags, most of what happens
- // here is skipping white space.
- if ( !p || !*p )
- {
- SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
- return 0;
- }
- // Note that, for a document, this needs to come
- // before the while space skip, so that parsing
- // starts from the pointer we are given.
- location.Clear();
- if ( prevData )
- {
- location.row = prevData->cursor.row;
- location.col = prevData->cursor.col;
- }
- else
- {
- location.row = 0;
- location.col = 0;
- }
- TiXmlParsingData data( p, TabSize(), location.row, location.col );
- location = data.Cursor();
- if ( encoding == TIXML_ENCODING_UNKNOWN )
- {
- // Check for the Microsoft UTF-8 lead bytes.
- const unsigned char* pU = (const unsigned char*)p;
- if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
- && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
- && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
- {
- encoding = TIXML_ENCODING_UTF8;
- useMicrosoftBOM = true;
- }
- }
- p = SkipWhiteSpace( p, encoding );
- if ( !p )
- {
- SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
- return 0;
- }
- while ( p && *p )
- {
- TiXmlNode* node = Identify( p, encoding );
- if ( node )
- {
- p = node->Parse( p, &data, encoding );
- LinkEndChild( node );
- }
- else
- {
- break;
- }
- // Did we get encoding info?
- if ( encoding == TIXML_ENCODING_UNKNOWN
- && node->ToDeclaration() )
- {
- TiXmlDeclaration* dec = node->ToDeclaration();
- const char* enc = dec->Encoding();
- assert( enc );
- if ( *enc == 0 )
- encoding = TIXML_ENCODING_UTF8;
- else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
- encoding = TIXML_ENCODING_UTF8;
- else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
- encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
- else
- encoding = TIXML_ENCODING_LEGACY;
- }
- p = SkipWhiteSpace( p, encoding );
- }
- // Was this empty?
- if ( !firstChild ) {
- SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
- return 0;
- }
- // All is well.
- return p;
- }
- void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- // The first error in a chain is more accurate - don't set again!
- if ( error )
- return;
- assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
- error = true;
- errorId = err;
- errorDesc = errorString[ errorId ];
- errorLocation.Clear();
- if ( pError && data )
- {
- data->Stamp( pError, encoding );
- errorLocation = data->Cursor();
- }
- }
- TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
- {
- TiXmlNode* returnNode = 0;
- p = SkipWhiteSpace( p, encoding );
- if( !p || !*p || *p != '<' )
- {
- return 0;
- }
- TiXmlDocument* doc = GetDocument();
- p = SkipWhiteSpace( p, encoding );
- if ( !p || !*p )
- {
- return 0;
- }
- // What is this thing?
- // - Elements start with a letter or underscore, but xml is reserved.
- // - Comments: <!--
- // - Decleration: <?xml
- // - Everthing else is unknown to tinyxml.
- //
- const char* xmlHeader = { "<?xml" };
- const char* commentHeader = { "<!--" };
- const char* dtdHeader = { "<!" };
- const char* cdataHeader = { "<![CDATA[" };
- if ( StringEqual( p, xmlHeader, true, encoding ) )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Declaration\n" );
- #endif
- returnNode = new TiXmlDeclaration();
- }
- else if ( StringEqual( p, commentHeader, false, encoding ) )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Comment\n" );
- #endif
- returnNode = new TiXmlComment();
- }
- else if ( StringEqual( p, cdataHeader, false, encoding ) )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing CDATA\n" );
- #endif
- TiXmlText* text = new TiXmlText( "" );
- text->SetCDATA( true );
- returnNode = text;
- }
- else if ( StringEqual( p, dtdHeader, false, encoding ) )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Unknown(1)\n" );
- #endif
- returnNode = new TiXmlUnknown();
- }
- else if ( IsAlpha( *(p+1), encoding )
- || *(p+1) == '_' )
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Element\n" );
- #endif
- returnNode = new TiXmlElement( "" );
- }
- else
- {
- #ifdef DEBUG_PARSER
- TIXML_LOG( "XML parsing Unknown(2)\n" );
- #endif
- returnNode = new TiXmlUnknown();
- }
- if ( returnNode )
- {
- // Set the parent, so it can report errors
- returnNode->parent = this;
- }
- else
- {
- if ( doc )
- doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
- }
- return returnNode;
- }
- #ifdef TIXML_USE_STL
- void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
- {
- // We're called with some amount of pre-parsing. That is, some of "this"
- // element is in "tag". Go ahead and stream to the closing ">"
- while( in->good() )
- {
- int c = in->get();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- (*tag) += (char) c ;
-
- if ( c == '>' )
- break;
- }
- if ( tag->length() < 3 ) return;
- // Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
- // If not, identify and stream.
- if ( tag->at( tag->length() - 1 ) == '>'
- && tag->at( tag->length() - 2 ) == '/' )
- {
- // All good!
- return;
- }
- else if ( tag->at( tag->length() - 1 ) == '>' )
- {
- // There is more. Could be:
- // text
- // cdata text (which looks like another node)
- // closing tag
- // another node.
- for ( ;; )
- {
- StreamWhiteSpace( in, tag );
- // Do we have text?
- if ( in->good() && in->peek() != '<' )
- {
- // Yep, text.
- TiXmlText text( "" );
- text.StreamIn( in, tag );
- // What follows text is a closing tag or another node.
- // Go around again and figure it out.
- continue;
- }
- // We now have either a closing tag...or another node.
- // We should be at a "<", regardless.
- if ( !in->good() ) return;
- assert( in->peek() == '<' );
- int tagIndex = (int) tag->length();
- bool closingTag = false;
- bool firstCharFound = false;
- for( ;; )
- {
- if ( !in->good() )
- return;
- int c = in->peek();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
-
- if ( c == '>' )
- break;
- *tag += (char) c;
- in->get();
- // Early out if we find the CDATA id.
- if ( c == '[' && tag->size() >= 9 )
- {
- size_t len = tag->size();
- const char* start = tag->c_str() + len - 9;
- if ( strcmp( start, "<![CDATA[" ) == 0 ) {
- assert( !closingTag );
- break;
- }
- }
- if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
- {
- firstCharFound = true;
- if ( c == '/' )
- closingTag = true;
- }
- }
- // If it was a closing tag, then read in the closing '>' to clean up the input stream.
- // If it was not, the streaming will be done by the tag.
- if ( closingTag )
- {
- if ( !in->good() )
- return;
- int c = in->get();
- if ( c <= 0 )
- {
- TiXmlDocument* document = GetDocument();
- if ( document )
- document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
- return;
- }
- assert( c == '>' );
- *tag += (char) c;
- // We are done, once we've found our closing tag.
- return;
- }
- else
- {
- // If not a closing tag, id it, and stream.
- const char* tagloc = tag->c_str() + tagIndex;
- TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
- if ( !node )
- return;
- node->StreamIn( in, tag );
- delete node;
- node = 0;
- // No return: go around from the beginning: text, closing tag, or node.
- }
- }
- }
- }
- #endif
- const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
- p = SkipWhiteSpace( p, encoding );
- TiXmlDocument* document = GetDocument();
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
- return 0;
- }
- if ( data )
- {
- data->Stamp( p, encoding );
- location = data->Cursor();
- }
- if ( *p != '<' )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
- return 0;
- }
- p = SkipWhiteSpace( p+1, encoding );
- // Read the name.
- const char* pErr = p;
- p = ReadName( p, &value, encoding );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
- return 0;
- }
- TIXML_STRING endTag ("</");
- endTag += value;
- endTag += ">";
- // Check for and read attributes. Also look for an empty
- // tag or an end tag.
- while ( p && *p )
- {
- pErr = p;
- p = SkipWhiteSpace( p, encoding );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
- return 0;
- }
- if ( *p == '/' )
- {
- ++p;
- // Empty tag.
- if ( *p != '>' )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
- return 0;
- }
- return (p+1);
- }
- else if ( *p == '>' )
- {
- // Done with attributes (if there were any.)
- // Read the value -- which can include other
- // elements -- read the end tag, and return.
- ++p;
- p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
- if ( !p || !*p )
- return 0;
- // We should find the end tag now
- if ( StringEqual( p, endTag.c_str(), false, encoding ) )
- {
- p += endTag.length();
- return p;
- }
- else
- {
- if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
- return 0;
- }
- }
- else
- {
- // Try to read an attribute:
- TiXmlAttribute* attrib = new TiXmlAttribute();
- if ( !attrib )
- {
- if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
- return 0;
- }
- attrib->SetDocument( document );
- pErr = p;
- p = attrib->Parse( p, data, encoding );
- if ( !p || !*p )
- {
- if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
- delete attrib;
- return 0;
- }
- // Handle the strange case of double attributes:
- #ifdef TIXML_USE_STL
- TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
- #else
- TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
- #endif
- if ( node )
- {
- node->SetValue( attrib->Value() );
- delete attrib;
- return 0;
- }
- attributeSet.Add( attrib );
- }
- }
- return p;
- }
- // Reads the content of an element: text runs and child nodes, in any order.
- // Scanning stops at the first "</" (returned unconsumed so the caller can match
- // the end tag), when the input is exhausted, or when a child fails.
- // Returns the position reached. Returns 0 when a '<' construct cannot be
- // identified; when the cursor is null after the loop,
- // TIXML_ERROR_READING_ELEMENT_VALUE is reported to the document (if any).
- const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
-     TiXmlDocument* doc = GetDocument();
-     // Position before leading white space was skipped; used when white space
-     // is not being condensed so that leading spaces of text are kept.
-     const char* rawStart = p;
-     for ( p = SkipWhiteSpace( p, encoding );
-           p && *p;
-           rawStart = p, p = SkipWhiteSpace( p, encoding ) )
-     {
-         if ( *p == '<' )
-         {
-             // Either an end tag, a nested node, or CDATA-style text.
-             if ( StringEqual( p, "</", false, encoding ) )
-                 return p;
-             TiXmlNode* child = Identify( p, encoding );
-             if ( !child )
-                 return 0;
-             p = child->Parse( p, data, encoding );
-             LinkEndChild( child );
-             continue;
-         }
-         // Anything else is character data: build a text node from it.
-         TiXmlText* text = new TiXmlText( "" );
-         if ( !text )
-         {
-             if ( doc ) doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
-             return 0;
-         }
-         const char* textStart = TiXmlBase::IsWhiteSpaceCondensed() ? p : rawStart;
-         p = text->Parse( textStart, data, encoding );
-         // Blank text nodes are discarded rather than linked.
-         if ( text->Blank() )
-             delete text;
-         else
-             LinkEndChild( text );
-     }
-     if ( !p && doc )
-         doc->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
-     return p;
- }
- #ifdef TIXML_USE_STL
- // Appends characters from the stream to *tag until a '>' has been appended.
- // A character value <= 0 (which includes the stream's end-of-file value)
- // reports TIXML_ERROR_EMBEDDED_NULL to the document, if any, and stops.
- void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
-     for ( ; in->good(); )
-     {
-         const int ch = in->get();
-         if ( ch > 0 )
-         {
-             (*tag) += static_cast<char>( ch );
-             if ( ch == '>' )
-                 return;     // closing bracket consumed: done
-             continue;
-         }
-         TiXmlDocument* doc = GetDocument();
-         if ( doc )
-             doc->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
-         return;
-     }
- }
- #endif
- // Parses an unrecognised "<...>" construct. Everything between the opening
- // '<' and the next '>' is stored in `value`.
- // Returns the position just past '>', or the position of the terminating NUL
- // if the input ends before a '>' is found. Returns 0 (after reporting
- // TIXML_ERROR_PARSING_UNKNOWN when a document is attached) if the input does
- // not start with '<' or the cursor is null.
- const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
-     TiXmlDocument* document = GetDocument();
-     p = SkipWhiteSpace( p, encoding );
-     if ( data )
-     {
-         data->Stamp( p, encoding );
-         location = data->Cursor();
-     }
-     if ( !p || !*p || *p != '<' )
-     {
-         if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
-         return 0;
-     }
-     ++p;
-     value = "";
-     while ( p && *p && *p != '>' )
-     {
-         value += *p;
-         ++p;
-     }
-     if ( !p )
-     {
-         // Return here: the original fell through and dereferenced the null
-         // cursor in the '>' test below.
-         if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
-         return 0;
-     }
-     if ( *p == '>' )
-         return p+1;
-     return p;
- }
- #ifdef TIXML_USE_STL
- // Appends characters from the stream to *tag until the accumulated text ends
- // with "-->". A character value <= 0 (which includes the stream's end-of-file
- // value) reports TIXML_ERROR_EMBEDDED_NULL to the document, if any, and stops.
- void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
-     while ( in->good() )
-     {
-         int c = in->get();
-         if ( c <= 0 )
-         {
-             TiXmlDocument* document = GetDocument();
-             if ( document )
-                 document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
-             return;
-         }
-         (*tag) += (char) c;
-         if ( c == '>' )
-         {
-             // Only look back two characters when they exist; without this
-             // guard a short tag makes the index wrap around and at() fail.
-             // Mirrors the size check in TiXmlText::StreamIn.
-             const size_t len = tag->length();
-             if ( len >= 3
-                  && tag->at( len - 2 ) == '-'
-                  && tag->at( len - 3 ) == '-' )
-             {
-                 // All is well.
-                 return;
-             }
-         }
-     }
- }
- #endif
- // Parses a comment. The input must start (after white space) with "<!--";
- // otherwise TIXML_ERROR_PARSING_COMMENT is reported to the document, if one is
- // attached, and 0 is returned. On success the text up to the "-->" end tag is
- // read into `value` via ReadText and ReadText's result is returned.
- const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
-     TiXmlDocument* document = GetDocument();
-     value = "";
-     p = SkipWhiteSpace( p, encoding );
-     if ( data )
-     {
-         data->Stamp( p, encoding );
-         location = data->Cursor();
-     }
-     const char* startTag = "<!--";
-     const char* endTag = "-->";
-     // The null test keeps a null cursor away from StringEqual, matching the
-     // `!p || ...` guards used by the other Parse methods in this file.
-     if ( !p || !StringEqual( p, startTag, false, encoding ) )
-     {
-         // A comment may be parsed without an owning document, so the
-         // document pointer must be checked before reporting.
-         if ( document ) document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
-         return 0;
-     }
-     p += strlen( startTag );
-     p = ReadText( p, &value, false, endTag, false, encoding );
-     return p;
- }
- // Parses one attribute of the form  name = value.
- // The value may be wrapped in single or double quotes; an unquoted value is
- // tolerated and runs until white space, '/', '>' or the end of input.
- // Returns the position reached after the value (whatever ReadText returns for
- // quoted values), or 0 on error. Errors are reported to `document`, when set,
- // as TIXML_ERROR_READING_ATTRIBUTES - except for input that is empty after
- // skipping white space, which returns 0 silently.
- const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
-     p = SkipWhiteSpace( p, encoding );
-     if ( !p || !*p )
-         return 0;
-     if ( data )
-     {
-         data->Stamp( p, encoding );
-         location = data->Cursor();
-     }
-     // --- name ---
-     const char* nameStart = p;
-     p = ReadName( p, &name, encoding );
-     if ( !p || !*p )
-     {
-         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, nameStart, data, encoding );
-         return 0;
-     }
-     // --- '=' ---
-     p = SkipWhiteSpace( p, encoding );
-     if ( !p || !*p || *p != '=' )
-     {
-         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
-         return 0;
-     }
-     ++p;
-     // --- value ---
-     p = SkipWhiteSpace( p, encoding );
-     if ( !p || !*p )
-     {
-         if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
-         return 0;
-     }
-     const char SINGLE_QUOTE = '\'';
-     const char DOUBLE_QUOTE = '\"';
-     if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE )
-     {
-         // Quoted value: read up to the matching quote character.
-         const char* closer = ( *p == SINGLE_QUOTE ) ? "\'" : "\"";
-         ++p;
-         return ReadText( p, &value, false, closer, false, encoding );
-     }
-     // All attribute values should be quoted, but missing quotes are such a
-     // common mistake that the parser does its best without them.
-     value = "";
-     for ( ; p && *p; ++p )
-     {
-         const char c = *p;
-         const bool atWhiteSpace = IsWhiteSpace( c ) || c == '\n' || c == '\r';
-         const bool atTagEnd = ( c == '/' || c == '>' );
-         if ( atWhiteSpace || atTagEnd )
-             break;
-         if ( c == SINGLE_QUOTE || c == DOUBLE_QUOTE )
-         {
-             // [ 1451649 ] Attribute values with trailing quotes not handled correctly:
-             // there was no opening quote but there is a closing one, so give up.
-             if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
-             return 0;
-         }
-         value += c;
-     }
-     return p;
- }
- #ifdef TIXML_USE_STL
- // Appends text characters from the stream to *tag.
- // For ordinary text, stops before a '<' and leaves it in the stream.
- // For CDATA text, stops once the accumulated text ends with "]]>".
- // A peeked value <= 0 (which includes the stream's end-of-file value) reports
- // TIXML_ERROR_EMBEDDED_NULL to the document, if any, and stops.
- void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
-     while ( in->good() )
-     {
-         const int next = in->peek();
-         if ( next == '<' && !cdata )
-             return;
-         if ( next <= 0 )
-         {
-             TiXmlDocument* doc = GetDocument();
-             if ( doc )
-                 doc->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
-             return;
-         }
-         (*tag) += static_cast<char>( next );
-         in->get();  // consume the character that was only peeked so far
-         if ( !cdata || next != '>' )
-             continue;
-         // Inside CDATA a '>' only terminates when preceded by "]]".
-         const size_t len = tag->size();
-         if ( len >= 3 && (*tag)[len-2] == ']' && (*tag)[len-3] == ']' )
-             return;
-     }
- }
- #endif
- // Parses a text node into `value`.
- // CDATA mode is used when the `cdata` flag is already set or the input starts
- // with "<![CDATA[". In that mode the input must start with the CDATA start
- // tag (otherwise TIXML_ERROR_PARSING_CDATA is reported to the document, if
- // one is attached, and 0 is returned); characters are then copied verbatim
- // until "]]>" or the end of input, and the result of ReadText on the
- // remaining input is returned.
- // Otherwise the text is read up to the next "<" via ReadText and the returned
- // position is backed up by one so the '<' is left for the caller; 0 is
- // returned if ReadText fails.
- const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
- {
-     value = "";
-     TiXmlDocument* document = GetDocument();
-     if ( data )
-     {
-         data->Stamp( p, encoding );
-         location = data->Cursor();
-     }
-     const char* const startTag = "<![CDATA[";
-     const char* const endTag = "]]>";
-     if ( cdata || StringEqual( p, startTag, false, encoding ) )
-     {
-         cdata = true;
-         if ( !StringEqual( p, startTag, false, encoding ) )
-         {
-             // A text node may be parsed without an owning document, so the
-             // document pointer must be checked before reporting.
-             if ( document ) document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
-             return 0;
-         }
-         p += strlen( startTag );
-         // Keep all the white space, ignore the encoding, etc.
-         while ( p && *p
-                 && !StringEqual( p, endTag, false, encoding )
-               )
-         {
-             value += *p;
-             ++p;
-         }
-         // The content is already in `value`; this call only processes the
-         // end tag, so its text output goes to a scratch string.
-         TIXML_STRING dummy;
-         p = ReadText( p, &dummy, false, endTag, false, encoding );
-         return p;
-     }
-     else
-     {
-         bool ignoreWhite = true;
-         const char* end = "<";
-         p = ReadText( p, &value, ignoreWhite, end, false, encoding );
-         if ( p )
-             return p-1; // don't truncate the '<'
-         return 0;
-     }
- }
- #ifdef TIXML_USE_STL
- // Appends characters from the stream to *tag, stopping after the first '>'.
- // A character value <= 0 (which includes the stream's end-of-file value)
- // reports TIXML_ERROR_EMBEDDED_NULL to the document, if any, and stops.
- void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
- {
-     while ( in->good() )
-     {
-         const int ch = in->get();
-         if ( ch <= 0 )
-         {
-             if ( TiXmlDocument* doc = GetDocument() )
-                 doc->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
-             return;
-         }
-         (*tag) += static_cast<char>( ch );
-         if ( ch == '>' )
-             return;     // end of the declaration reached
-     }
- }
- #endif
- // Parses an XML declaration that starts with "<?xml".
- // The version, encoding and standalone members are cleared and then filled
- // from the attributes of the same names; any other token is skipped.
- // Returns the position just past the closing '>'. Returns 0 if the input does
- // not start with "<?xml" (TIXML_ERROR_PARSING_DECLARATION is reported to the
- // document, if any) or if the input ends / the cursor becomes null before a
- // '>' is seen (no error is reported for that case here).
- const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
- {
-     p = SkipWhiteSpace( p, _encoding );
-     TiXmlDocument* doc = GetDocument();
-     const bool startsDeclaration = p && *p && StringEqual( p, "<?xml", true, _encoding );
-     if ( !startsDeclaration )
-     {
-         if ( doc ) doc->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
-         return 0;
-     }
-     if ( data )
-     {
-         data->Stamp( p, _encoding );
-         location = data->Cursor();
-     }
-     p += 5;     // step over "<?xml"
-     version = "";
-     encoding = "";
-     standalone = "";
-     for ( ; p && *p; )
-     {
-         if ( *p == '>' )
-             return p + 1;
-         p = SkipWhiteSpace( p, _encoding );
-         // Test the known attribute names in order; the first match wins.
-         const bool isVersion = StringEqual( p, "version", true, _encoding );
-         const bool isEncoding = !isVersion && StringEqual( p, "encoding", true, _encoding );
-         const bool isStandalone = !isVersion && !isEncoding
-                                   && StringEqual( p, "standalone", true, _encoding );
-         if ( !isVersion && !isEncoding && !isStandalone )
-         {
-             // Unknown token: read over it up to white space or '>'.
-             while ( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
-                 ++p;
-             continue;
-         }
-         TiXmlAttribute attrib;
-         p = attrib.Parse( p, data, _encoding );
-         if ( isVersion )
-             version = attrib.Value();
-         else if ( isEncoding )
-             encoding = attrib.Value();
-         else
-             standalone = attrib.Value();
-     }
-     return 0;
- }
- // Returns true when every character of `value` satisfies IsWhiteSpace
- // (which is trivially the case for an empty value), false otherwise.
- bool TiXmlText::Blank() const
- {
-     unsigned idx = 0;
-     while ( idx < value.length() )
-     {
-         if ( !IsWhiteSpace( value[idx] ) )
-             return false;
-         ++idx;
-     }
-     return true;
- }
|