@@ -141,6 +141,7 @@ static int parse_xml_decl(const xmlChar *str, size_t *lenp,
141141 xmlChar * * version , xmlChar * * encoding , int * standalone );
142142static bool print_xml_decl (StringInfo buf , const xmlChar * version ,
143143 pg_enc encoding , int standalone );
144+ static bool xml_doctype_in_content (const xmlChar * str );
144145static xmlDocPtr xml_parse (text * data , XmlOptionType xmloption_arg ,
145146 bool preserve_whitespace , int encoding );
146147static text * xml_xmlnodetoxmltype (xmlNodePtr cur , PgXmlErrorContext * xmlerrcxt );
@@ -1243,8 +1244,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp,
12431244 if (xmlStrncmp (p , (xmlChar * ) "<?xml" , 5 ) != 0 )
12441245 goto finished ;
12451246
1246- /* if next char is name char, it's a PI like <?xml-stylesheet ...?> */
1247- utf8len = strlen ((const char * ) (p + 5 ));
1247+ /*
1248+ * If next char is a name char, it's a PI like <?xml-stylesheet ...?>
1249+ * rather than an XMLDecl, so we have done what we came to do and found no
1250+ * XMLDecl.
1251+ *
1252+ * We need an input length value for xmlGetUTF8Char, but there's no need
1253+ * to count the whole document size, so use strnlen not strlen.
1254+ */
1255+ utf8len = strnlen ((const char * ) (p + 5 ), MAX_MULTIBYTE_CHAR_LEN );
12481256 utf8char = xmlGetUTF8Char (p + 5 , & utf8len );
12491257 if (PG_XMLISNAMECHAR (utf8char ))
12501258 goto finished ;
@@ -1415,6 +1423,88 @@ print_xml_decl(StringInfo buf, const xmlChar *version,
14151423 return false;
14161424}
14171425
1426+ /*
1427+ * Test whether an input that is to be parsed as CONTENT contains a DTD.
1428+ *
1429+ * The SQL/XML:2003 definition of CONTENT ("XMLDecl? content") is not
1430+ * satisfied by a document with a DTD, which is a bit of a wart, as it means
1431+ * the CONTENT type is not a proper superset of DOCUMENT. SQL/XML:2006 and
1432+ * later fix that, by redefining content with reference to the "more
1433+ * permissive" Document Node of the XQuery/XPath Data Model, such that any
1434+ * DOCUMENT value is indeed also a CONTENT value. That definition is more
1435+ * useful, as CONTENT becomes usable for parsing input of unknown form (think
1436+ * pg_restore).
1437+ *
1438+ * As used below in xml_parse when parsing for CONTENT, libxml does not give
1439+ * us the 2006+ behavior, but only the 2003; it will choke if the input has
1440+ * a DTD. But we can provide the 2006+ definition of CONTENT easily enough,
1441+ * by detecting this case first and simply doing the parse as DOCUMENT.
1442+ *
1443+ * A DTD can be found arbitrarily far in, but that would be a contrived case;
1444+ * it will ordinarily start within a few dozen characters. The only things
1445+ * that can precede it are an XMLDecl (here, the caller will have called
1446+ * parse_xml_decl already), whitespace, comments, and processing instructions.
1447+ * This function need only return true if it sees a valid sequence of such
1448+ * things leading to <!DOCTYPE. It can simply return false in any other
1449+ * cases, including malformed input; that will mean the input gets parsed as
1450+ * CONTENT as originally planned, with libxml reporting any errors.
1451+ *
1452+ * This is only to be called from xml_parse, when pg_xml_init has already
1453+ * been called. The input is already in UTF8 encoding.
1454+ */
1455+ static bool
1456+ xml_doctype_in_content (const xmlChar * str )
1457+ {
1458+ const xmlChar * p = str ;
1459+
1460+ for (;;)
1461+ {
1462+ const xmlChar * e ;
1463+
1464+ SKIP_XML_SPACE (p );
1465+ if (* p != '<' )
1466+ return false;
1467+ p ++ ;
1468+
1469+ if (* p == '!' )
1470+ {
1471+ p ++ ;
1472+
1473+ /* if we see <!DOCTYPE, we can return true */
1474+ if (xmlStrncmp (p , (xmlChar * ) "DOCTYPE" , 7 ) == 0 )
1475+ return true;
1476+
1477+ /* otherwise, if it's not a comment, fail */
1478+ if (xmlStrncmp (p , (xmlChar * ) "--" , 2 ) != 0 )
1479+ return false;
1480+ /* find end of comment: find -- and a > must follow */
1481+ p = xmlStrstr (p + 2 , (xmlChar * ) "--" );
1482+ if (!p || p [2 ] != '>' )
1483+ return false;
1484+ /* advance over comment, and keep scanning */
1485+ p += 3 ;
1486+ continue ;
1487+ }
1488+
1489+ /* otherwise, if it's not a PI <?target something?>, fail */
1490+ if (* p != '?' )
1491+ return false;
1492+ p ++ ;
1493+
1494+ /* find end of PI (the string ?> is forbidden within a PI) */
1495+ e = xmlStrstr (p , (xmlChar * ) "?>" );
1496+ if (!e )
1497+ return false;
1498+
1499+ /* we don't check PIs carefully, but do reject "xml" target */
1500+ if (e - p >= 3 && xmlStrncasecmp (p , (xmlChar * ) "xml" , 3 ) == 0 )
1501+ return false;
1502+
1503+ /* advance over PI, keep scanning */
1504+ p = e + 2 ;
1505+ }
1506+ }
1507+
14181508
14191509/*
14201510 * Convert a C string to XML internal representation
@@ -1450,14 +1540,38 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
14501540 /* Use a TRY block to ensure we clean up correctly */
14511541 PG_TRY ();
14521542 {
1543+ bool parse_as_document = false;
1544+ int res_code ;
1545+ size_t count = 0 ;
1546+ xmlChar * version = NULL ;
1547+ int standalone = 0 ;
1548+
14531549 xmlInitParser ();
14541550
14551551 ctxt = xmlNewParserCtxt ();
14561552 if (ctxt == NULL || xmlerrcxt -> err_occurred )
14571553 xml_ereport (xmlerrcxt , ERROR , ERRCODE_OUT_OF_MEMORY ,
14581554 "could not allocate parser context" );
14591555
1556+ /* Decide whether to parse as document or content */
14601557 if (xmloption_arg == XMLOPTION_DOCUMENT )
1558+ parse_as_document = true;
1559+ else
1560+ {
1561+ /* Parse and skip over the XML declaration, if any */
1562+ res_code = parse_xml_decl (utf8string ,
1563+ & count , & version , NULL , & standalone );
1564+ if (res_code != 0 )
1565+ xml_ereport_by_code (ERROR , ERRCODE_INVALID_XML_CONTENT ,
1566+ "invalid XML content: invalid XML declaration" ,
1567+ res_code );
1568+
1569+ /* Is there a DOCTYPE element? */
1570+ if (xml_doctype_in_content (utf8string + count ))
1571+ parse_as_document = true;
1572+ }
1573+
1574+ if (parse_as_document )
14611575 {
14621576 /*
14631577 * Note, that here we try to apply DTD defaults
@@ -1472,23 +1586,18 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace,
14721586 XML_PARSE_NOENT | XML_PARSE_DTDATTR
14731587 | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS ));
14741588 if (doc == NULL || xmlerrcxt -> err_occurred )
1475- xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_DOCUMENT ,
1476- "invalid XML document" );
1589+ {
1590+ /* Use original option to decide which error code to throw */
1591+ if (xmloption_arg == XMLOPTION_DOCUMENT )
1592+ xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_DOCUMENT ,
1593+ "invalid XML document" );
1594+ else
1595+ xml_ereport (xmlerrcxt , ERROR , ERRCODE_INVALID_XML_CONTENT ,
1596+ "invalid XML content" );
1597+ }
14771598 }
14781599 else
14791600 {
1480- int res_code ;
1481- size_t count ;
1482- xmlChar * version ;
1483- int standalone ;
1484-
1485- res_code = parse_xml_decl (utf8string ,
1486- & count , & version , NULL , & standalone );
1487- if (res_code != 0 )
1488- xml_ereport_by_code (ERROR , ERRCODE_INVALID_XML_CONTENT ,
1489- "invalid XML content: invalid XML declaration" ,
1490- res_code );
1491-
14921601 doc = xmlNewDoc (version );
14931602 Assert (doc -> encoding == NULL );
14941603 doc -> encoding = xmlStrdup ((const xmlChar * ) "UTF-8" );
0 commit comments