2121 * DEALINGS IN THE SOFTWARE.
2222 */
2323
24+ /*
25+ * The comments following this one that use the same comment syntax as this
26+ * comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
27+ * as of 10 September 2020. That document came with this statement:
28+ * Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
29+ * licensed under a Creative Commons Attribution 4.0 International License.
30+ */
31+
2432package nu .validator .htmlparser .io ;
2533
2634import java .io .IOException ;
@@ -197,9 +205,8 @@ public void tokenize(InputSource is) throws SAXException, IOException {
197205 tokenizer .getErrorHandler (), tokenizer , this , heuristics );
198206 } else {
199207 if (this .characterEncoding != Encoding .UTF8 ) {
200- errorWithoutLocation ("Legacy encoding \u201C "
201- + this .characterEncoding .getCanonName ()
202- + "\u201D used. Documents must use UTF-8." );
208+ errorWithoutLocation (Encoding .msgLegacyEncoding (
209+ this .characterEncoding .getCanonName ()));
203210 }
204211 becomeConfident ();
205212 this .reader = new HtmlInputStreamReader (inputStream ,
@@ -333,50 +340,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
333340 }
334341 }
335342
343+ private void errInternalActualDiffer (String internalCharset , String actual )
344+ throws SAXException {
345+ if (!internalCharset .equals (actual )) {
346+ tokenizer .errTreeBuilder (
347+ "Ignoring internal encoding declaration \u201C "
348+ + internalCharset + "\u201D , which disagrees with"
349+ + " the actual encoding of the document (\u201C "
350+ + actual + "\u201D )." );
351+ }
352+ }
353+
336354 public boolean internalEncodingDeclaration (String internalCharset )
337355 throws SAXException {
356+ String actual = characterEncoding .getCanonName ();
357+ if (confidence == Confidence .CERTAIN ) {
358+ errInternalActualDiffer (internalCharset , actual );
359+ return true ;
360+ }
361+ /* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
338362 try {
363+ if ("utf-16be" .equals (actual ) || "utf-16le" .equals (actual )) {
364+ errInternalActualDiffer (internalCharset , actual );
365+ /*
366+ * 1. If the encoding that is already being used to interpret
367+ * the input stream is a UTF-16 encoding, then set the
368+ * confidence to certain and return. The new encoding is ignored.
369+ * NOTE(review): becomeConfident(); is not called on this path - confirm intentional.
370+ */
371+ return true ;
372+ }
339373 internalCharset = internalCharset .toLowerCase ();
340374 Encoding cs = Encoding .forName (internalCharset );
341375 if ("utf-16be" .equals (internalCharset )
342376 || "utf-16le" .equals (internalCharset )) {
343- tokenizer .errTreeBuilder ("Internal encoding declaration specified \u201C "
344- + internalCharset
345- + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201C utf-8\u201D ." );
377+ /*
378+ * 2. If the new encoding is a UTF-16 encoding, then change it
379+ * to UTF-8.
380+ */
381+ tokenizer .errTreeBuilder (
382+ Encoding .msgIgnoredCharset (internalCharset , "utf-8" ));
346383 cs = Encoding .UTF8 ;
347384 internalCharset = "utf-8" ;
348- } else {
349- cs = Encoding .forName (internalCharset );
350- }
351- Encoding actual = cs .getActualHtmlEncoding ();
352- if (actual == null ) {
353- actual = cs ;
385+ } else if ("x-user-defined" .equals (internalCharset )) {
386+ /*
387+ * 3. If the new encoding is x-user-defined, then change it to
388+ * windows-1252.
389+ */
390+ tokenizer .errTreeBuilder (Encoding .msgIgnoredCharset (
391+ "x-user-defined" , "windows-1252" ));
392+ cs = Encoding .WINDOWS1252 ;
393+ internalCharset = "windows-1252" ;
354394 }
355395 if (characterEncoding == null ) {
356396 // Reader case
357397 return true ;
358398 }
359- if (characterEncoding == actual ) {
399+ if (characterEncoding == cs ) {
400+ /*
401+ * 4. If the new encoding is identical or equivalent to the
402+ * encoding that is already being used to interpret the input
403+ * stream, then set the confidence to certain and return.
404+ */
360405 becomeConfident ();
361406 return true ;
362407 }
363- if (confidence == Confidence .CERTAIN && actual != characterEncoding ) {
364- tokenizer .errTreeBuilder ("Internal encoding declaration \u201C "
365- + internalCharset
366- + "\u201D disagrees with the actual encoding of the document (\u201C "
367- + characterEncoding .getCanonName () + "\u201D )." );
368- } else {
369- Encoding newEnc = whineAboutEncodingAndReturnCanonical (
370- internalCharset , cs );
371- tokenizer .errTreeBuilder ("Changing character encoding \u201C "
372- + internalCharset + "\u201D and reparsing." );
373- characterEncoding = newEnc ;
374- throw new ReparseException ();
375- }
376- return true ;
408+ /*
409+ * 6. Otherwise, navigate to the document again, with
410+ * historyHandling set to "replace", and using the same source
411+ * browsing context, but this time skip the encoding sniffing
412+ * algorithm and instead just set the encoding to the new encoding [...]
413+ */
414+ Encoding newEnc = whineAboutEncodingAndReturnCanonical (
415+ internalCharset , cs );
416+ tokenizer .errTreeBuilder ("Changing character encoding to \u201C "
417+ + internalCharset + "\u201D and reparsing." );
418+ characterEncoding = newEnc ;
419+ // Note: We intentionally don’t call becomeConfident() at this
420+ // point. If we did, it would end up causing the exception
421+ // java.lang.IllegalStateException: rewind() after willNotRewind()
422+ // to be thrown later. So we are departing here from strictly
423+ // following the ordering in the corresponding spec language, which
424+ // specifies setting the confidence to "certain" at this point.
425+ throw new ReparseException ();
377426 } catch (UnsupportedCharsetException e ) {
378- tokenizer .errTreeBuilder ("Internal encoding declaration named an unsupported chararacter encoding \u201C "
379- + internalCharset + " \u201D ." );
427+ tokenizer .errTreeBuilder (
428+ Encoding . msgBadInternalCharset ( internalCharset ) );
380429 return false ;
381430 }
382431 }
@@ -436,8 +485,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
436485 }
437486 return whineAboutEncodingAndReturnCanonical (encoding , cs );
438487 } catch (UnsupportedCharsetException e ) {
439- tokenizer .err ("Unsupported character encoding name: \u201C " + encoding
440- + "\u201D . Will sniff." );
488+ tokenizer .err (Encoding .msgBadEncoding (encoding ) + " Will sniff." );
441489 swallowBom = true ;
442490 }
443491 return null ; // keep the compiler happy
@@ -453,7 +501,7 @@ protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
453501 Encoding cs ) throws SAXException {
454502 String canonName = cs .getCanonName ();
455503 if (!canonName .equals (encoding )) {
456- tokenizer .err (Encoding .msgNotPreferredName (encoding , canonName ));
504+ tokenizer .err (Encoding .msgNotCanonicalName (encoding , canonName ));
457505 }
458506 return cs ;
459507 }
0 commit comments