parse.go 59 KB


  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package html
  5. import (
  6. "errors"
  7. "fmt"
  8. "io"
  9. "strings"
  10. a "golang.org/x/net/html/atom"
  11. )
  12. // A parser implements the HTML5 parsing algorithm:
  13. // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
  14. type parser struct {
  15. // tokenizer provides the tokens for the parser.
  16. tokenizer *Tokenizer
  17. // tok is the most recently read token.
  18. tok Token
  19. // Self-closing tags like <hr/> are treated as start tags, except that
  20. // hasSelfClosingToken is set while they are being processed.
  21. hasSelfClosingToken bool
  22. // doc is the document root element.
  23. doc *Node
  24. // The stack of open elements (section 12.2.4.2) and active formatting
  25. // elements (section 12.2.4.3).
  26. oe, afe nodeStack
  27. // Element pointers (section 12.2.4.4).
  28. head, form *Node
  29. // Other parsing state flags (section 12.2.4.5).
  30. scripting, framesetOK bool
  31. // The stack of template insertion modes
  32. templateStack insertionModeStack
  33. // im is the current insertion mode.
  34. im insertionMode
  35. // originalIM is the insertion mode to go back to after completing a text
  36. // or inTableText insertion mode.
  37. originalIM insertionMode
  38. // fosterParenting is whether new elements should be inserted according to
  39. // the foster parenting rules (section 12.2.6.1).
  40. fosterParenting bool
  41. // quirks is whether the parser is operating in "quirks mode."
  42. quirks bool
  43. // fragment is whether the parser is parsing an HTML fragment.
  44. fragment bool
  45. // context is the context element when parsing an HTML fragment
  46. // (section 12.4).
  47. context *Node
  48. }
  49. func (p *parser) top() *Node {
  50. if n := p.oe.top(); n != nil {
  51. return n
  52. }
  53. return p.doc
  54. }
  55. // Stop tags for use in popUntil. These come from section 12.2.4.2.
  56. var (
  57. defaultScopeStopTags = map[string][]a.Atom{
  58. "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
  59. "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
  60. "svg": {a.Desc, a.ForeignObject, a.Title},
  61. }
  62. )
  63. type scope int
  64. const (
  65. defaultScope scope = iota
  66. listItemScope
  67. buttonScope
  68. tableScope
  69. tableRowScope
  70. tableBodyScope
  71. selectScope
  72. )
  73. // popUntil pops the stack of open elements at the highest element whose tag
  74. // is in matchTags, provided there is no higher element in the scope's stop
  75. // tags (as defined in section 12.2.4.2). It returns whether or not there was
  76. // such an element. If there was not, popUntil leaves the stack unchanged.
  77. //
  78. // For example, the set of stop tags for table scope is: "html", "table". If
  79. // the stack was:
  80. // ["html", "body", "font", "table", "b", "i", "u"]
  81. // then popUntil(tableScope, "font") would return false, but
  82. // popUntil(tableScope, "i") would return true and the stack would become:
  83. // ["html", "body", "font", "table", "b"]
  84. //
  85. // If an element's tag is in both the stop tags and matchTags, then the stack
  86. // will be popped and the function returns true (provided, of course, there was
  87. // no higher element in the stack that was also in the stop tags). For example,
  88. // popUntil(tableScope, "table") returns true and leaves:
  89. // ["html", "body", "font"]
  90. func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
  91. if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
  92. p.oe = p.oe[:i]
  93. return true
  94. }
  95. return false
  96. }
  97. // indexOfElementInScope returns the index in p.oe of the highest element whose
  98. // tag is in matchTags that is in scope. If no matching element is in scope, it
  99. // returns -1.
  100. func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
  101. for i := len(p.oe) - 1; i >= 0; i-- {
  102. tagAtom := p.oe[i].DataAtom
  103. if p.oe[i].Namespace == "" {
  104. for _, t := range matchTags {
  105. if t == tagAtom {
  106. return i
  107. }
  108. }
  109. switch s {
  110. case defaultScope:
  111. // No-op.
  112. case listItemScope:
  113. if tagAtom == a.Ol || tagAtom == a.Ul {
  114. return -1
  115. }
  116. case buttonScope:
  117. if tagAtom == a.Button {
  118. return -1
  119. }
  120. case tableScope:
  121. if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
  122. return -1
  123. }
  124. case selectScope:
  125. if tagAtom != a.Optgroup && tagAtom != a.Option {
  126. return -1
  127. }
  128. default:
  129. panic("unreachable")
  130. }
  131. }
  132. switch s {
  133. case defaultScope, listItemScope, buttonScope:
  134. for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
  135. if t == tagAtom {
  136. return -1
  137. }
  138. }
  139. }
  140. }
  141. return -1
  142. }
  143. // elementInScope is like popUntil, except that it doesn't modify the stack of
  144. // open elements.
  145. func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
  146. return p.indexOfElementInScope(s, matchTags...) != -1
  147. }
  148. // clearStackToContext pops elements off the stack of open elements until a
  149. // scope-defined element is found.
  150. func (p *parser) clearStackToContext(s scope) {
  151. for i := len(p.oe) - 1; i >= 0; i-- {
  152. tagAtom := p.oe[i].DataAtom
  153. switch s {
  154. case tableScope:
  155. if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template {
  156. p.oe = p.oe[:i+1]
  157. return
  158. }
  159. case tableRowScope:
  160. if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template {
  161. p.oe = p.oe[:i+1]
  162. return
  163. }
  164. case tableBodyScope:
  165. if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template {
  166. p.oe = p.oe[:i+1]
  167. return
  168. }
  169. default:
  170. panic("unreachable")
  171. }
  172. }
  173. }
  174. // parseGenericRawTextElement implements the generic raw text element parsing
  175. // algorithm defined in 12.2.6.2.
  176. // https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text
  177. // TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part
  178. // officially, need to make tokenizer consider both states.
  179. func (p *parser) parseGenericRawTextElement() {
  180. p.addElement()
  181. p.originalIM = p.im
  182. p.im = textIM
  183. }
  184. // generateImpliedEndTags pops nodes off the stack of open elements as long as
  185. // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc.
  186. // If exceptions are specified, nodes with that name will not be popped off.
  187. func (p *parser) generateImpliedEndTags(exceptions ...string) {
  188. var i int
  189. loop:
  190. for i = len(p.oe) - 1; i >= 0; i-- {
  191. n := p.oe[i]
  192. if n.Type != ElementNode {
  193. break
  194. }
  195. switch n.DataAtom {
  196. case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc:
  197. for _, except := range exceptions {
  198. if n.Data == except {
  199. break loop
  200. }
  201. }
  202. continue
  203. }
  204. break
  205. }
  206. p.oe = p.oe[:i+1]
  207. }
  208. // addChild adds a child node n to the top element, and pushes n onto the stack
  209. // of open elements if it is an element node.
  210. func (p *parser) addChild(n *Node) {
  211. if p.shouldFosterParent() {
  212. p.fosterParent(n)
  213. } else {
  214. p.top().AppendChild(n)
  215. }
  216. if n.Type == ElementNode {
  217. p.oe = append(p.oe, n)
  218. }
  219. }
  220. // shouldFosterParent returns whether the next node to be added should be
  221. // foster parented.
  222. func (p *parser) shouldFosterParent() bool {
  223. if p.fosterParenting {
  224. switch p.top().DataAtom {
  225. case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  226. return true
  227. }
  228. }
  229. return false
  230. }
  231. // fosterParent adds a child node according to the foster parenting rules.
  232. // Section 12.2.6.1, "foster parenting".
  233. func (p *parser) fosterParent(n *Node) {
  234. var table, parent, prev, template *Node
  235. var i int
  236. for i = len(p.oe) - 1; i >= 0; i-- {
  237. if p.oe[i].DataAtom == a.Table {
  238. table = p.oe[i]
  239. break
  240. }
  241. }
  242. var j int
  243. for j = len(p.oe) - 1; j >= 0; j-- {
  244. if p.oe[j].DataAtom == a.Template {
  245. template = p.oe[j]
  246. break
  247. }
  248. }
  249. if template != nil && (table == nil || j > i) {
  250. template.AppendChild(n)
  251. return
  252. }
  253. if table == nil {
  254. // The foster parent is the html element.
  255. parent = p.oe[0]
  256. } else {
  257. parent = table.Parent
  258. }
  259. if parent == nil {
  260. parent = p.oe[i-1]
  261. }
  262. if table != nil {
  263. prev = table.PrevSibling
  264. } else {
  265. prev = parent.LastChild
  266. }
  267. if prev != nil && prev.Type == TextNode && n.Type == TextNode {
  268. prev.Data += n.Data
  269. return
  270. }
  271. parent.InsertBefore(n, table)
  272. }
  273. // addText adds text to the preceding node if it is a text node, or else it
  274. // calls addChild with a new text node.
  275. func (p *parser) addText(text string) {
  276. if text == "" {
  277. return
  278. }
  279. if p.shouldFosterParent() {
  280. p.fosterParent(&Node{
  281. Type: TextNode,
  282. Data: text,
  283. })
  284. return
  285. }
  286. t := p.top()
  287. if n := t.LastChild; n != nil && n.Type == TextNode {
  288. n.Data += text
  289. return
  290. }
  291. p.addChild(&Node{
  292. Type: TextNode,
  293. Data: text,
  294. })
  295. }
  296. // addElement adds a child element based on the current token.
  297. func (p *parser) addElement() {
  298. p.addChild(&Node{
  299. Type: ElementNode,
  300. DataAtom: p.tok.DataAtom,
  301. Data: p.tok.Data,
  302. Attr: p.tok.Attr,
  303. })
  304. }
  305. // Section 12.2.4.3.
  306. func (p *parser) addFormattingElement() {
  307. tagAtom, attr := p.tok.DataAtom, p.tok.Attr
  308. p.addElement()
  309. // Implement the Noah's Ark clause, but with three per family instead of two.
  310. identicalElements := 0
  311. findIdenticalElements:
  312. for i := len(p.afe) - 1; i >= 0; i-- {
  313. n := p.afe[i]
  314. if n.Type == scopeMarkerNode {
  315. break
  316. }
  317. if n.Type != ElementNode {
  318. continue
  319. }
  320. if n.Namespace != "" {
  321. continue
  322. }
  323. if n.DataAtom != tagAtom {
  324. continue
  325. }
  326. if len(n.Attr) != len(attr) {
  327. continue
  328. }
  329. compareAttributes:
  330. for _, t0 := range n.Attr {
  331. for _, t1 := range attr {
  332. if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
  333. // Found a match for this attribute, continue with the next attribute.
  334. continue compareAttributes
  335. }
  336. }
  337. // If we get here, there is no attribute that matches a.
  338. // Therefore the element is not identical to the new one.
  339. continue findIdenticalElements
  340. }
  341. identicalElements++
  342. if identicalElements >= 3 {
  343. p.afe.remove(n)
  344. }
  345. }
  346. p.afe = append(p.afe, p.top())
  347. }
  348. // Section 12.2.4.3.
  349. func (p *parser) clearActiveFormattingElements() {
  350. for {
  351. if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode {
  352. return
  353. }
  354. }
  355. }
  356. // Section 12.2.4.3.
  357. func (p *parser) reconstructActiveFormattingElements() {
  358. n := p.afe.top()
  359. if n == nil {
  360. return
  361. }
  362. if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
  363. return
  364. }
  365. i := len(p.afe) - 1
  366. for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
  367. if i == 0 {
  368. i = -1
  369. break
  370. }
  371. i--
  372. n = p.afe[i]
  373. }
  374. for {
  375. i++
  376. clone := p.afe[i].clone()
  377. p.addChild(clone)
  378. p.afe[i] = clone
  379. if i == len(p.afe)-1 {
  380. break
  381. }
  382. }
  383. }
  384. // Section 12.2.5.
  385. func (p *parser) acknowledgeSelfClosingTag() {
  386. p.hasSelfClosingToken = false
  387. }
  388. // An insertion mode (section 12.2.4.1) is the state transition function from
  389. // a particular state in the HTML5 parser's state machine. It updates the
  390. // parser's fields depending on parser.tok (where ErrorToken means EOF).
  391. // It returns whether the token was consumed.
  392. type insertionMode func(*parser) bool
  393. // setOriginalIM sets the insertion mode to return to after completing a text or
  394. // inTableText insertion mode.
  395. // Section 12.2.4.1, "using the rules for".
  396. func (p *parser) setOriginalIM() {
  397. if p.originalIM != nil {
  398. panic("html: bad parser state: originalIM was set twice")
  399. }
  400. p.originalIM = p.im
  401. }
// resetInsertionMode implements section 12.2.4.1, "reset the insertion mode".
//
// It walks the stack of open elements from the top down and sets p.im based on
// the first element whose tag selects an insertion mode. For the bottom entry
// of the stack, the fragment context element (p.context) is examined instead
// when it is set. If the stack is empty, p.im is left unchanged.
func (p *parser) resetInsertionMode() {
	for i := len(p.oe) - 1; i >= 0; i-- {
		n := p.oe[i]
		// last is true for the bottom entry of the stack.
		last := i == 0
		if last && p.context != nil {
			n = p.context
		}
		switch n.DataAtom {
		case a.Select:
			if !last {
				// Walk down the stack from the select element: a template
				// below it means plain "in select", a table below it means
				// "in select in table".
				for ancestor, first := n, p.oe[0]; ancestor != first; {
					ancestor = p.oe[p.oe.index(ancestor)-1]
					switch ancestor.DataAtom {
					case a.Template:
						p.im = inSelectIM
						return
					case a.Table:
						p.im = inSelectInTableIM
						return
					}
				}
			}
			p.im = inSelectIM
		case a.Td, a.Th:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inCellIM
		case a.Tr:
			p.im = inRowIM
		case a.Tbody, a.Thead, a.Tfoot:
			p.im = inTableBodyIM
		case a.Caption:
			p.im = inCaptionIM
		case a.Colgroup:
			p.im = inColumnGroupIM
		case a.Table:
			p.im = inTableIM
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			// A template element in a non-empty namespace is skipped.
			if n.Namespace != "" {
				continue
			}
			p.im = p.templateStack.top()
		case a.Head:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.im = inHeadIM
		case a.Body:
			p.im = inBodyIM
		case a.Frameset:
			p.im = inFramesetIM
		case a.Html:
			if p.head == nil {
				p.im = beforeHeadIM
			} else {
				p.im = afterHeadIM
			}
		default:
			// No recognized tag: fall back to "in body" at the bottom of the
			// stack, otherwise keep looking further down.
			if last {
				p.im = inBodyIM
				return
			}
			continue
		}
		// Every case that falls out of the switch has chosen an insertion mode.
		return
	}
}
  472. const whitespace = " \t\r\n\f"
  473. // Section 12.2.6.4.1.
  474. func initialIM(p *parser) bool {
  475. switch p.tok.Type {
  476. case TextToken:
  477. p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
  478. if len(p.tok.Data) == 0 {
  479. // It was all whitespace, so ignore it.
  480. return true
  481. }
  482. case CommentToken:
  483. p.doc.AppendChild(&Node{
  484. Type: CommentNode,
  485. Data: p.tok.Data,
  486. })
  487. return true
  488. case DoctypeToken:
  489. n, quirks := parseDoctype(p.tok.Data)
  490. p.doc.AppendChild(n)
  491. p.quirks = quirks
  492. p.im = beforeHTMLIM
  493. return true
  494. }
  495. p.quirks = true
  496. p.im = beforeHTMLIM
  497. return false
  498. }
  499. // Section 12.2.6.4.2.
  500. func beforeHTMLIM(p *parser) bool {
  501. switch p.tok.Type {
  502. case DoctypeToken:
  503. // Ignore the token.
  504. return true
  505. case TextToken:
  506. p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
  507. if len(p.tok.Data) == 0 {
  508. // It was all whitespace, so ignore it.
  509. return true
  510. }
  511. case StartTagToken:
  512. if p.tok.DataAtom == a.Html {
  513. p.addElement()
  514. p.im = beforeHeadIM
  515. return true
  516. }
  517. case EndTagToken:
  518. switch p.tok.DataAtom {
  519. case a.Head, a.Body, a.Html, a.Br:
  520. p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
  521. return false
  522. default:
  523. // Ignore the token.
  524. return true
  525. }
  526. case CommentToken:
  527. p.doc.AppendChild(&Node{
  528. Type: CommentNode,
  529. Data: p.tok.Data,
  530. })
  531. return true
  532. }
  533. p.parseImpliedToken(StartTagToken, a.Html, a.Html.String())
  534. return false
  535. }
  536. // Section 12.2.6.4.3.
  537. func beforeHeadIM(p *parser) bool {
  538. switch p.tok.Type {
  539. case TextToken:
  540. p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
  541. if len(p.tok.Data) == 0 {
  542. // It was all whitespace, so ignore it.
  543. return true
  544. }
  545. case StartTagToken:
  546. switch p.tok.DataAtom {
  547. case a.Head:
  548. p.addElement()
  549. p.head = p.top()
  550. p.im = inHeadIM
  551. return true
  552. case a.Html:
  553. return inBodyIM(p)
  554. }
  555. case EndTagToken:
  556. switch p.tok.DataAtom {
  557. case a.Head, a.Body, a.Html, a.Br:
  558. p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
  559. return false
  560. default:
  561. // Ignore the token.
  562. return true
  563. }
  564. case CommentToken:
  565. p.addChild(&Node{
  566. Type: CommentNode,
  567. Data: p.tok.Data,
  568. })
  569. return true
  570. case DoctypeToken:
  571. // Ignore the token.
  572. return true
  573. }
  574. p.parseImpliedToken(StartTagToken, a.Head, a.Head.String())
  575. return false
  576. }
// inHeadIM is the "in head" insertion mode (section 12.2.6.4.4).
//
// It returns true when the token was consumed. For any token it does not
// handle itself, it parses an implied </head> end tag and returns false so
// that the token is not treated as consumed by this mode.
func inHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			// Keep only the non-whitespace remainder for the fallthrough
			// handling at the end of the function.
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta:
			// These elements are inserted and popped again immediately, so
			// they never stay on the stack of open elements.
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			return true
		case a.Noscript:
			if p.scripting {
				p.parseGenericRawTextElement()
				return true
			}
			p.addElement()
			p.im = inHeadNoscriptIM
			// Don't let the tokenizer go into raw text mode when scripting is disabled.
			p.tokenizer.NextIsNotRawText()
			return true
		case a.Script, a.Title:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
			return true
		case a.Noframes, a.Style:
			p.parseGenericRawTextElement()
			return true
		case a.Head:
			// Ignore the token.
			return true
		case a.Template:
			// TODO: remove this divergence from the HTML5 spec.
			//
			// We don't handle all of the corner cases when mixing foreign
			// content (i.e. <math> or <svg>) with <template>. Without this
			// early return, we can get into an infinite loop, possibly because
			// of the "TODO... further divergence" a little below.
			//
			// As a workaround, if we are mixing foreign content and templates,
			// just ignore the rest of the HTML. Foreign content is rare and a
			// relatively old HTML feature. Templates are also rare and a
			// relatively new HTML feature. Their combination is very rare.
			for _, e := range p.oe {
				if e.Namespace != "" {
					p.im = ignoreTheRemainingTokens
					return true
				}
			}
			// Insert the template, push a scope marker onto the active
			// formatting elements, and enter "in template" both as the
			// current mode and on the template insertion mode stack.
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
			p.im = inTemplateIM
			p.templateStack = append(p.templateStack, inTemplateIM)
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			p.oe.pop()
			p.im = afterHeadIM
			return true
		case a.Body, a.Html, a.Br:
			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
			return false
		case a.Template:
			if !p.oe.contains(a.Template) {
				return true
			}
			// TODO: remove this further divergence from the HTML5 spec.
			//
			// See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
			p.generateImpliedEndTags()
			// Pop everything up to and including the nearest template element
			// that has an empty namespace.
			for i := len(p.oe) - 1; i >= 0; i-- {
				if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
					p.oe = p.oe[:i]
					break
				}
			}
			p.clearActiveFormattingElements()
			p.templateStack.pop()
			p.resetInsertionMode()
			return true
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}
	// Anything else: act as if a </head> end tag had been seen and report the
	// token as not consumed.
	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
	return false
}
  689. // Section 12.2.6.4.5.
  690. func inHeadNoscriptIM(p *parser) bool {
  691. switch p.tok.Type {
  692. case DoctypeToken:
  693. // Ignore the token.
  694. return true
  695. case StartTagToken:
  696. switch p.tok.DataAtom {
  697. case a.Html:
  698. return inBodyIM(p)
  699. case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style:
  700. return inHeadIM(p)
  701. case a.Head:
  702. // Ignore the token.
  703. return true
  704. case a.Noscript:
  705. // Don't let the tokenizer go into raw text mode even when a <noscript>
  706. // tag is in "in head noscript" insertion mode.
  707. p.tokenizer.NextIsNotRawText()
  708. // Ignore the token.
  709. return true
  710. }
  711. case EndTagToken:
  712. switch p.tok.DataAtom {
  713. case a.Noscript, a.Br:
  714. default:
  715. // Ignore the token.
  716. return true
  717. }
  718. case TextToken:
  719. s := strings.TrimLeft(p.tok.Data, whitespace)
  720. if len(s) == 0 {
  721. // It was all whitespace.
  722. return inHeadIM(p)
  723. }
  724. case CommentToken:
  725. return inHeadIM(p)
  726. }
  727. p.oe.pop()
  728. if p.top().DataAtom != a.Head {
  729. panic("html: the new current node will be a head element.")
  730. }
  731. p.im = inHeadIM
  732. if p.tok.DataAtom == a.Noscript {
  733. return true
  734. }
  735. return false
  736. }
// afterHeadIM is the "after head" insertion mode (section 12.2.6.4.6).
//
// It returns true when the token was consumed. For tokens it does not handle
// itself, it parses an implied <body> start tag and returns false, unless the
// current token is an ErrorToken, in which case parsing stops.
func afterHeadIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			// Keep only the non-whitespace remainder for the implied <body>
			// handling at the end of the function.
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Body:
			p.addElement()
			p.framesetOK = false
			p.im = inBodyIM
			return true
		case a.Frameset:
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			// Temporarily push the head element back onto the stack of open
			// elements so that inHeadIM inserts into it; the deferred call
			// removes it again once inHeadIM has returned.
			p.oe = append(p.oe, p.head)
			defer p.oe.remove(p.head)
			return inHeadIM(p)
		case a.Head:
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body, a.Html, a.Br:
			// Drop down to creating an implied <body> tag.
		case a.Template:
			return inHeadIM(p)
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}
	// Anything else: act as if a <body> start tag had been seen, and mark
	// framesetOK as true afterwards.
	p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
	p.framesetOK = true
	if p.tok.Type == ErrorToken {
		// Stop parsing.
		return true
	}
	return false
}
  799. // copyAttributes copies attributes of src not found on dst to dst.
  800. func copyAttributes(dst *Node, src Token) {
  801. if len(src.Attr) == 0 {
  802. return
  803. }
  804. attr := map[string]string{}
  805. for _, t := range dst.Attr {
  806. attr[t.Key] = t.Val
  807. }
  808. for _, t := range src.Attr {
  809. if _, ok := attr[t.Key]; !ok {
  810. dst.Attr = append(dst.Attr, t)
  811. attr[t.Key] = t.Val
  812. }
  813. }
  814. }
  815. // Section 12.2.6.4.7.
  816. func inBodyIM(p *parser) bool {
  817. switch p.tok.Type {
  818. case TextToken:
  819. d := p.tok.Data
  820. switch n := p.oe.top(); n.DataAtom {
  821. case a.Pre, a.Listing:
  822. if n.FirstChild == nil {
  823. // Ignore a newline at the start of a <pre> block.
  824. if d != "" && d[0] == '\r' {
  825. d = d[1:]
  826. }
  827. if d != "" && d[0] == '\n' {
  828. d = d[1:]
  829. }
  830. }
  831. }
  832. d = strings.Replace(d, "\x00", "", -1)
  833. if d == "" {
  834. return true
  835. }
  836. p.reconstructActiveFormattingElements()
  837. p.addText(d)
  838. if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
  839. // There were non-whitespace characters inserted.
  840. p.framesetOK = false
  841. }
  842. case StartTagToken:
  843. switch p.tok.DataAtom {
  844. case a.Html:
  845. if p.oe.contains(a.Template) {
  846. return true
  847. }
  848. copyAttributes(p.oe[0], p.tok)
  849. case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
  850. return inHeadIM(p)
  851. case a.Body:
  852. if p.oe.contains(a.Template) {
  853. return true
  854. }
  855. if len(p.oe) >= 2 {
  856. body := p.oe[1]
  857. if body.Type == ElementNode && body.DataAtom == a.Body {
  858. p.framesetOK = false
  859. copyAttributes(body, p.tok)
  860. }
  861. }
  862. case a.Frameset:
  863. if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
  864. // Ignore the token.
  865. return true
  866. }
  867. body := p.oe[1]
  868. if body.Parent != nil {
  869. body.Parent.RemoveChild(body)
  870. }
  871. p.oe = p.oe[:1]
  872. p.addElement()
  873. p.im = inFramesetIM
  874. return true
  875. case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Search, a.Section, a.Summary, a.Ul:
  876. p.popUntil(buttonScope, a.P)
  877. p.addElement()
  878. case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
  879. p.popUntil(buttonScope, a.P)
  880. switch n := p.top(); n.DataAtom {
  881. case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
  882. p.oe.pop()
  883. }
  884. p.addElement()
  885. case a.Pre, a.Listing:
  886. p.popUntil(buttonScope, a.P)
  887. p.addElement()
  888. // The newline, if any, will be dealt with by the TextToken case.
  889. p.framesetOK = false
  890. case a.Form:
  891. if p.form != nil && !p.oe.contains(a.Template) {
  892. // Ignore the token
  893. return true
  894. }
  895. p.popUntil(buttonScope, a.P)
  896. p.addElement()
  897. if !p.oe.contains(a.Template) {
  898. p.form = p.top()
  899. }
  900. case a.Li:
  901. p.framesetOK = false
  902. for i := len(p.oe) - 1; i >= 0; i-- {
  903. node := p.oe[i]
  904. switch node.DataAtom {
  905. case a.Li:
  906. p.oe = p.oe[:i]
  907. case a.Address, a.Div, a.P:
  908. continue
  909. default:
  910. if !isSpecialElement(node) {
  911. continue
  912. }
  913. }
  914. break
  915. }
  916. p.popUntil(buttonScope, a.P)
  917. p.addElement()
  918. case a.Dd, a.Dt:
  919. p.framesetOK = false
  920. for i := len(p.oe) - 1; i >= 0; i-- {
  921. node := p.oe[i]
  922. switch node.DataAtom {
  923. case a.Dd, a.Dt:
  924. p.oe = p.oe[:i]
  925. case a.Address, a.Div, a.P:
  926. continue
  927. default:
  928. if !isSpecialElement(node) {
  929. continue
  930. }
  931. }
  932. break
  933. }
  934. p.popUntil(buttonScope, a.P)
  935. p.addElement()
  936. case a.Plaintext:
  937. p.popUntil(buttonScope, a.P)
  938. p.addElement()
  939. case a.Button:
  940. p.popUntil(defaultScope, a.Button)
  941. p.reconstructActiveFormattingElements()
  942. p.addElement()
  943. p.framesetOK = false
  944. case a.A:
  945. for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
  946. if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
  947. p.inBodyEndTagFormatting(a.A, "a")
  948. p.oe.remove(n)
  949. p.afe.remove(n)
  950. break
  951. }
  952. }
  953. p.reconstructActiveFormattingElements()
  954. p.addFormattingElement()
  955. case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
  956. p.reconstructActiveFormattingElements()
  957. p.addFormattingElement()
  958. case a.Nobr:
  959. p.reconstructActiveFormattingElements()
  960. if p.elementInScope(defaultScope, a.Nobr) {
  961. p.inBodyEndTagFormatting(a.Nobr, "nobr")
  962. p.reconstructActiveFormattingElements()
  963. }
  964. p.addFormattingElement()
  965. case a.Applet, a.Marquee, a.Object:
  966. p.reconstructActiveFormattingElements()
  967. p.addElement()
  968. p.afe = append(p.afe, &scopeMarker)
  969. p.framesetOK = false
  970. case a.Table:
  971. if !p.quirks {
  972. p.popUntil(buttonScope, a.P)
  973. }
  974. p.addElement()
  975. p.framesetOK = false
  976. p.im = inTableIM
  977. return true
  978. case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
  979. p.reconstructActiveFormattingElements()
  980. p.addElement()
  981. p.oe.pop()
  982. p.acknowledgeSelfClosingTag()
  983. if p.tok.DataAtom == a.Input {
  984. for _, t := range p.tok.Attr {
  985. if t.Key == "type" {
  986. if strings.EqualFold(t.Val, "hidden") {
  987. // Skip setting framesetOK = false
  988. return true
  989. }
  990. }
  991. }
  992. }
  993. p.framesetOK = false
  994. case a.Param, a.Source, a.Track:
  995. p.addElement()
  996. p.oe.pop()
  997. p.acknowledgeSelfClosingTag()
  998. case a.Hr:
  999. p.popUntil(buttonScope, a.P)
  1000. p.addElement()
  1001. p.oe.pop()
  1002. p.acknowledgeSelfClosingTag()
  1003. p.framesetOK = false
  1004. case a.Image:
  1005. p.tok.DataAtom = a.Img
  1006. p.tok.Data = a.Img.String()
  1007. return false
  1008. case a.Textarea:
  1009. p.addElement()
  1010. p.setOriginalIM()
  1011. p.framesetOK = false
  1012. p.im = textIM
  1013. case a.Xmp:
  1014. p.popUntil(buttonScope, a.P)
  1015. p.reconstructActiveFormattingElements()
  1016. p.framesetOK = false
  1017. p.parseGenericRawTextElement()
  1018. case a.Iframe:
  1019. p.framesetOK = false
  1020. p.parseGenericRawTextElement()
  1021. case a.Noembed:
  1022. p.parseGenericRawTextElement()
  1023. case a.Noscript:
  1024. if p.scripting {
  1025. p.parseGenericRawTextElement()
  1026. return true
  1027. }
  1028. p.reconstructActiveFormattingElements()
  1029. p.addElement()
  1030. // Don't let the tokenizer go into raw text mode when scripting is disabled.
  1031. p.tokenizer.NextIsNotRawText()
  1032. case a.Select:
  1033. p.reconstructActiveFormattingElements()
  1034. p.addElement()
  1035. p.framesetOK = false
  1036. p.im = inSelectIM
  1037. return true
  1038. case a.Optgroup, a.Option:
  1039. if p.top().DataAtom == a.Option {
  1040. p.oe.pop()
  1041. }
  1042. p.reconstructActiveFormattingElements()
  1043. p.addElement()
  1044. case a.Rb, a.Rtc:
  1045. if p.elementInScope(defaultScope, a.Ruby) {
  1046. p.generateImpliedEndTags()
  1047. }
  1048. p.addElement()
  1049. case a.Rp, a.Rt:
  1050. if p.elementInScope(defaultScope, a.Ruby) {
  1051. p.generateImpliedEndTags("rtc")
  1052. }
  1053. p.addElement()
  1054. case a.Math, a.Svg:
  1055. p.reconstructActiveFormattingElements()
  1056. if p.tok.DataAtom == a.Math {
  1057. adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
  1058. } else {
  1059. adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
  1060. }
  1061. adjustForeignAttributes(p.tok.Attr)
  1062. p.addElement()
  1063. p.top().Namespace = p.tok.Data
  1064. if p.hasSelfClosingToken {
  1065. p.oe.pop()
  1066. p.acknowledgeSelfClosingTag()
  1067. }
  1068. return true
  1069. case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
  1070. // Ignore the token.
  1071. default:
  1072. p.reconstructActiveFormattingElements()
  1073. p.addElement()
  1074. }
  1075. case EndTagToken:
  1076. switch p.tok.DataAtom {
  1077. case a.Body:
  1078. if p.elementInScope(defaultScope, a.Body) {
  1079. p.im = afterBodyIM
  1080. }
  1081. case a.Html:
  1082. if p.elementInScope(defaultScope, a.Body) {
  1083. p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
  1084. return false
  1085. }
  1086. return true
  1087. case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Search, a.Section, a.Summary, a.Ul:
  1088. p.popUntil(defaultScope, p.tok.DataAtom)
  1089. case a.Form:
  1090. if p.oe.contains(a.Template) {
  1091. i := p.indexOfElementInScope(defaultScope, a.Form)
  1092. if i == -1 {
  1093. // Ignore the token.
  1094. return true
  1095. }
  1096. p.generateImpliedEndTags()
  1097. if p.oe[i].DataAtom != a.Form {
  1098. // Ignore the token.
  1099. return true
  1100. }
  1101. p.popUntil(defaultScope, a.Form)
  1102. } else {
  1103. node := p.form
  1104. p.form = nil
  1105. i := p.indexOfElementInScope(defaultScope, a.Form)
  1106. if node == nil || i == -1 || p.oe[i] != node {
  1107. // Ignore the token.
  1108. return true
  1109. }
  1110. p.generateImpliedEndTags()
  1111. p.oe.remove(node)
  1112. }
  1113. case a.P:
  1114. if !p.elementInScope(buttonScope, a.P) {
  1115. p.parseImpliedToken(StartTagToken, a.P, a.P.String())
  1116. }
  1117. p.popUntil(buttonScope, a.P)
  1118. case a.Li:
  1119. p.popUntil(listItemScope, a.Li)
  1120. case a.Dd, a.Dt:
  1121. p.popUntil(defaultScope, p.tok.DataAtom)
  1122. case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
  1123. p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
  1124. case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
  1125. p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
  1126. case a.Applet, a.Marquee, a.Object:
  1127. if p.popUntil(defaultScope, p.tok.DataAtom) {
  1128. p.clearActiveFormattingElements()
  1129. }
  1130. case a.Br:
  1131. p.tok.Type = StartTagToken
  1132. return false
  1133. case a.Template:
  1134. return inHeadIM(p)
  1135. default:
  1136. p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
  1137. }
  1138. case CommentToken:
  1139. p.addChild(&Node{
  1140. Type: CommentNode,
  1141. Data: p.tok.Data,
  1142. })
  1143. case ErrorToken:
  1144. // TODO: remove this divergence from the HTML5 spec.
  1145. if len(p.templateStack) > 0 {
  1146. p.im = inTemplateIM
  1147. return false
  1148. }
  1149. for _, e := range p.oe {
  1150. switch e.DataAtom {
  1151. case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
  1152. a.Thead, a.Tr, a.Body, a.Html:
  1153. default:
  1154. return true
  1155. }
  1156. }
  1157. }
  1158. return true
  1159. }
  1160. func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
  1161. // This is the "adoption agency" algorithm, described at
  1162. // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency
  1163. // TODO: this is a fairly literal line-by-line translation of that algorithm.
  1164. // Once the code successfully parses the comprehensive test suite, we should
  1165. // refactor this code to be more idiomatic.
  1166. // Steps 1-2
  1167. if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 {
  1168. p.oe.pop()
  1169. return
  1170. }
  1171. // Steps 3-5. The outer loop.
  1172. for i := 0; i < 8; i++ {
  1173. // Step 6. Find the formatting element.
  1174. var formattingElement *Node
  1175. for j := len(p.afe) - 1; j >= 0; j-- {
  1176. if p.afe[j].Type == scopeMarkerNode {
  1177. break
  1178. }
  1179. if p.afe[j].DataAtom == tagAtom {
  1180. formattingElement = p.afe[j]
  1181. break
  1182. }
  1183. }
  1184. if formattingElement == nil {
  1185. p.inBodyEndTagOther(tagAtom, tagName)
  1186. return
  1187. }
  1188. // Step 7. Ignore the tag if formatting element is not in the stack of open elements.
  1189. feIndex := p.oe.index(formattingElement)
  1190. if feIndex == -1 {
  1191. p.afe.remove(formattingElement)
  1192. return
  1193. }
  1194. // Step 8. Ignore the tag if formatting element is not in the scope.
  1195. if !p.elementInScope(defaultScope, tagAtom) {
  1196. // Ignore the tag.
  1197. return
  1198. }
  1199. // Step 9. This step is omitted because it's just a parse error but no need to return.
  1200. // Steps 10-11. Find the furthest block.
  1201. var furthestBlock *Node
  1202. for _, e := range p.oe[feIndex:] {
  1203. if isSpecialElement(e) {
  1204. furthestBlock = e
  1205. break
  1206. }
  1207. }
  1208. if furthestBlock == nil {
  1209. e := p.oe.pop()
  1210. for e != formattingElement {
  1211. e = p.oe.pop()
  1212. }
  1213. p.afe.remove(e)
  1214. return
  1215. }
  1216. // Steps 12-13. Find the common ancestor and bookmark node.
  1217. commonAncestor := p.oe[feIndex-1]
  1218. bookmark := p.afe.index(formattingElement)
  1219. // Step 14. The inner loop. Find the lastNode to reparent.
  1220. lastNode := furthestBlock
  1221. node := furthestBlock
  1222. x := p.oe.index(node)
  1223. // Step 14.1.
  1224. j := 0
  1225. for {
  1226. // Step 14.2.
  1227. j++
  1228. // Step. 14.3.
  1229. x--
  1230. node = p.oe[x]
  1231. // Step 14.4. Go to the next step if node is formatting element.
  1232. if node == formattingElement {
  1233. break
  1234. }
  1235. // Step 14.5. Remove node from the list of active formatting elements if
  1236. // inner loop counter is greater than three and node is in the list of
  1237. // active formatting elements.
  1238. if ni := p.afe.index(node); j > 3 && ni > -1 {
  1239. p.afe.remove(node)
  1240. // If any element of the list of active formatting elements is removed,
  1241. // we need to take care whether bookmark should be decremented or not.
  1242. // This is because the value of bookmark may exceed the size of the
  1243. // list by removing elements from the list.
  1244. if ni <= bookmark {
  1245. bookmark--
  1246. }
  1247. continue
  1248. }
  1249. // Step 14.6. Continue the next inner loop if node is not in the list of
  1250. // active formatting elements.
  1251. if p.afe.index(node) == -1 {
  1252. p.oe.remove(node)
  1253. continue
  1254. }
  1255. // Step 14.7.
  1256. clone := node.clone()
  1257. p.afe[p.afe.index(node)] = clone
  1258. p.oe[p.oe.index(node)] = clone
  1259. node = clone
  1260. // Step 14.8.
  1261. if lastNode == furthestBlock {
  1262. bookmark = p.afe.index(node) + 1
  1263. }
  1264. // Step 14.9.
  1265. if lastNode.Parent != nil {
  1266. lastNode.Parent.RemoveChild(lastNode)
  1267. }
  1268. node.AppendChild(lastNode)
  1269. // Step 14.10.
  1270. lastNode = node
  1271. }
  1272. // Step 15. Reparent lastNode to the common ancestor,
  1273. // or for misnested table nodes, to the foster parent.
  1274. if lastNode.Parent != nil {
  1275. lastNode.Parent.RemoveChild(lastNode)
  1276. }
  1277. switch commonAncestor.DataAtom {
  1278. case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  1279. p.fosterParent(lastNode)
  1280. default:
  1281. commonAncestor.AppendChild(lastNode)
  1282. }
  1283. // Steps 16-18. Reparent nodes from the furthest block's children
  1284. // to a clone of the formatting element.
  1285. clone := formattingElement.clone()
  1286. reparentChildren(clone, furthestBlock)
  1287. furthestBlock.AppendChild(clone)
  1288. // Step 19. Fix up the list of active formatting elements.
  1289. if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
  1290. // Move the bookmark with the rest of the list.
  1291. bookmark--
  1292. }
  1293. p.afe.remove(formattingElement)
  1294. p.afe.insert(bookmark, clone)
  1295. // Step 20. Fix up the stack of open elements.
  1296. p.oe.remove(formattingElement)
  1297. p.oe.insert(p.oe.index(furthestBlock)+1, clone)
  1298. }
  1299. }
  1300. // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
  1301. // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
  1302. // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
  1303. func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
  1304. for i := len(p.oe) - 1; i >= 0; i-- {
  1305. // Two element nodes have the same tag if they have the same Data (a
  1306. // string-typed field). As an optimization, for common HTML tags, each
  1307. // Data string is assigned a unique, non-zero DataAtom (a uint32-typed
  1308. // field), since integer comparison is faster than string comparison.
  1309. // Uncommon (custom) tags get a zero DataAtom.
  1310. //
  1311. // The if condition here is equivalent to (p.oe[i].Data == tagName).
  1312. if (p.oe[i].DataAtom == tagAtom) &&
  1313. ((tagAtom != 0) || (p.oe[i].Data == tagName)) {
  1314. p.oe = p.oe[:i]
  1315. break
  1316. }
  1317. if isSpecialElement(p.oe[i]) {
  1318. break
  1319. }
  1320. }
  1321. }
  1322. // Section 12.2.6.4.8.
  1323. func textIM(p *parser) bool {
  1324. switch p.tok.Type {
  1325. case ErrorToken:
  1326. p.oe.pop()
  1327. case TextToken:
  1328. d := p.tok.Data
  1329. if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
  1330. // Ignore a newline at the start of a <textarea> block.
  1331. if d != "" && d[0] == '\r' {
  1332. d = d[1:]
  1333. }
  1334. if d != "" && d[0] == '\n' {
  1335. d = d[1:]
  1336. }
  1337. }
  1338. if d == "" {
  1339. return true
  1340. }
  1341. p.addText(d)
  1342. return true
  1343. case EndTagToken:
  1344. p.oe.pop()
  1345. }
  1346. p.im = p.originalIM
  1347. p.originalIM = nil
  1348. return p.tok.Type == EndTagToken
  1349. }
  1350. // Section 12.2.6.4.9.
  1351. func inTableIM(p *parser) bool {
  1352. switch p.tok.Type {
  1353. case TextToken:
  1354. p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
  1355. switch p.oe.top().DataAtom {
  1356. case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  1357. if strings.Trim(p.tok.Data, whitespace) == "" {
  1358. p.addText(p.tok.Data)
  1359. return true
  1360. }
  1361. }
  1362. case StartTagToken:
  1363. switch p.tok.DataAtom {
  1364. case a.Caption:
  1365. p.clearStackToContext(tableScope)
  1366. p.afe = append(p.afe, &scopeMarker)
  1367. p.addElement()
  1368. p.im = inCaptionIM
  1369. return true
  1370. case a.Colgroup:
  1371. p.clearStackToContext(tableScope)
  1372. p.addElement()
  1373. p.im = inColumnGroupIM
  1374. return true
  1375. case a.Col:
  1376. p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
  1377. return false
  1378. case a.Tbody, a.Tfoot, a.Thead:
  1379. p.clearStackToContext(tableScope)
  1380. p.addElement()
  1381. p.im = inTableBodyIM
  1382. return true
  1383. case a.Td, a.Th, a.Tr:
  1384. p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
  1385. return false
  1386. case a.Table:
  1387. if p.popUntil(tableScope, a.Table) {
  1388. p.resetInsertionMode()
  1389. return false
  1390. }
  1391. // Ignore the token.
  1392. return true
  1393. case a.Style, a.Script, a.Template:
  1394. return inHeadIM(p)
  1395. case a.Input:
  1396. for _, t := range p.tok.Attr {
  1397. if t.Key == "type" && strings.EqualFold(t.Val, "hidden") {
  1398. p.addElement()
  1399. p.oe.pop()
  1400. return true
  1401. }
  1402. }
  1403. // Otherwise drop down to the default action.
  1404. case a.Form:
  1405. if p.oe.contains(a.Template) || p.form != nil {
  1406. // Ignore the token.
  1407. return true
  1408. }
  1409. p.addElement()
  1410. p.form = p.oe.pop()
  1411. case a.Select:
  1412. p.reconstructActiveFormattingElements()
  1413. switch p.top().DataAtom {
  1414. case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  1415. p.fosterParenting = true
  1416. }
  1417. p.addElement()
  1418. p.fosterParenting = false
  1419. p.framesetOK = false
  1420. p.im = inSelectInTableIM
  1421. return true
  1422. }
  1423. case EndTagToken:
  1424. switch p.tok.DataAtom {
  1425. case a.Table:
  1426. if p.popUntil(tableScope, a.Table) {
  1427. p.resetInsertionMode()
  1428. return true
  1429. }
  1430. // Ignore the token.
  1431. return true
  1432. case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
  1433. // Ignore the token.
  1434. return true
  1435. case a.Template:
  1436. return inHeadIM(p)
  1437. }
  1438. case CommentToken:
  1439. p.addChild(&Node{
  1440. Type: CommentNode,
  1441. Data: p.tok.Data,
  1442. })
  1443. return true
  1444. case DoctypeToken:
  1445. // Ignore the token.
  1446. return true
  1447. case ErrorToken:
  1448. return inBodyIM(p)
  1449. }
  1450. p.fosterParenting = true
  1451. defer func() { p.fosterParenting = false }()
  1452. return inBodyIM(p)
  1453. }
  1454. // Section 12.2.6.4.11.
  1455. func inCaptionIM(p *parser) bool {
  1456. switch p.tok.Type {
  1457. case StartTagToken:
  1458. switch p.tok.DataAtom {
  1459. case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
  1460. if !p.popUntil(tableScope, a.Caption) {
  1461. // Ignore the token.
  1462. return true
  1463. }
  1464. p.clearActiveFormattingElements()
  1465. p.im = inTableIM
  1466. return false
  1467. case a.Select:
  1468. p.reconstructActiveFormattingElements()
  1469. p.addElement()
  1470. p.framesetOK = false
  1471. p.im = inSelectInTableIM
  1472. return true
  1473. }
  1474. case EndTagToken:
  1475. switch p.tok.DataAtom {
  1476. case a.Caption:
  1477. if p.popUntil(tableScope, a.Caption) {
  1478. p.clearActiveFormattingElements()
  1479. p.im = inTableIM
  1480. }
  1481. return true
  1482. case a.Table:
  1483. if !p.popUntil(tableScope, a.Caption) {
  1484. // Ignore the token.
  1485. return true
  1486. }
  1487. p.clearActiveFormattingElements()
  1488. p.im = inTableIM
  1489. return false
  1490. case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
  1491. // Ignore the token.
  1492. return true
  1493. }
  1494. }
  1495. return inBodyIM(p)
  1496. }
  1497. // Section 12.2.6.4.12.
  1498. func inColumnGroupIM(p *parser) bool {
  1499. switch p.tok.Type {
  1500. case TextToken:
  1501. s := strings.TrimLeft(p.tok.Data, whitespace)
  1502. if len(s) < len(p.tok.Data) {
  1503. // Add the initial whitespace to the current node.
  1504. p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
  1505. if s == "" {
  1506. return true
  1507. }
  1508. p.tok.Data = s
  1509. }
  1510. case CommentToken:
  1511. p.addChild(&Node{
  1512. Type: CommentNode,
  1513. Data: p.tok.Data,
  1514. })
  1515. return true
  1516. case DoctypeToken:
  1517. // Ignore the token.
  1518. return true
  1519. case StartTagToken:
  1520. switch p.tok.DataAtom {
  1521. case a.Html:
  1522. return inBodyIM(p)
  1523. case a.Col:
  1524. p.addElement()
  1525. p.oe.pop()
  1526. p.acknowledgeSelfClosingTag()
  1527. return true
  1528. case a.Template:
  1529. return inHeadIM(p)
  1530. }
  1531. case EndTagToken:
  1532. switch p.tok.DataAtom {
  1533. case a.Colgroup:
  1534. if p.oe.top().DataAtom == a.Colgroup {
  1535. p.oe.pop()
  1536. p.im = inTableIM
  1537. }
  1538. return true
  1539. case a.Col:
  1540. // Ignore the token.
  1541. return true
  1542. case a.Template:
  1543. return inHeadIM(p)
  1544. }
  1545. case ErrorToken:
  1546. return inBodyIM(p)
  1547. }
  1548. if p.oe.top().DataAtom != a.Colgroup {
  1549. return true
  1550. }
  1551. p.oe.pop()
  1552. p.im = inTableIM
  1553. return false
  1554. }
  1555. // Section 12.2.6.4.13.
  1556. func inTableBodyIM(p *parser) bool {
  1557. switch p.tok.Type {
  1558. case StartTagToken:
  1559. switch p.tok.DataAtom {
  1560. case a.Tr:
  1561. p.clearStackToContext(tableBodyScope)
  1562. p.addElement()
  1563. p.im = inRowIM
  1564. return true
  1565. case a.Td, a.Th:
  1566. p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String())
  1567. return false
  1568. case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
  1569. if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
  1570. p.im = inTableIM
  1571. return false
  1572. }
  1573. // Ignore the token.
  1574. return true
  1575. }
  1576. case EndTagToken:
  1577. switch p.tok.DataAtom {
  1578. case a.Tbody, a.Tfoot, a.Thead:
  1579. if p.elementInScope(tableScope, p.tok.DataAtom) {
  1580. p.clearStackToContext(tableBodyScope)
  1581. p.oe.pop()
  1582. p.im = inTableIM
  1583. }
  1584. return true
  1585. case a.Table:
  1586. if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) {
  1587. p.im = inTableIM
  1588. return false
  1589. }
  1590. // Ignore the token.
  1591. return true
  1592. case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr:
  1593. // Ignore the token.
  1594. return true
  1595. }
  1596. case CommentToken:
  1597. p.addChild(&Node{
  1598. Type: CommentNode,
  1599. Data: p.tok.Data,
  1600. })
  1601. return true
  1602. }
  1603. return inTableIM(p)
  1604. }
  1605. // Section 12.2.6.4.14.
  1606. func inRowIM(p *parser) bool {
  1607. switch p.tok.Type {
  1608. case StartTagToken:
  1609. switch p.tok.DataAtom {
  1610. case a.Td, a.Th:
  1611. p.clearStackToContext(tableRowScope)
  1612. p.addElement()
  1613. p.afe = append(p.afe, &scopeMarker)
  1614. p.im = inCellIM
  1615. return true
  1616. case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  1617. if p.popUntil(tableScope, a.Tr) {
  1618. p.im = inTableBodyIM
  1619. return false
  1620. }
  1621. // Ignore the token.
  1622. return true
  1623. }
  1624. case EndTagToken:
  1625. switch p.tok.DataAtom {
  1626. case a.Tr:
  1627. if p.popUntil(tableScope, a.Tr) {
  1628. p.im = inTableBodyIM
  1629. return true
  1630. }
  1631. // Ignore the token.
  1632. return true
  1633. case a.Table:
  1634. if p.popUntil(tableScope, a.Tr) {
  1635. p.im = inTableBodyIM
  1636. return false
  1637. }
  1638. // Ignore the token.
  1639. return true
  1640. case a.Tbody, a.Tfoot, a.Thead:
  1641. if p.elementInScope(tableScope, p.tok.DataAtom) {
  1642. p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String())
  1643. return false
  1644. }
  1645. // Ignore the token.
  1646. return true
  1647. case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th:
  1648. // Ignore the token.
  1649. return true
  1650. }
  1651. }
  1652. return inTableIM(p)
  1653. }
  1654. // Section 12.2.6.4.15.
  1655. func inCellIM(p *parser) bool {
  1656. switch p.tok.Type {
  1657. case StartTagToken:
  1658. switch p.tok.DataAtom {
  1659. case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
  1660. if p.popUntil(tableScope, a.Td, a.Th) {
  1661. // Close the cell and reprocess.
  1662. p.clearActiveFormattingElements()
  1663. p.im = inRowIM
  1664. return false
  1665. }
  1666. // Ignore the token.
  1667. return true
  1668. case a.Select:
  1669. p.reconstructActiveFormattingElements()
  1670. p.addElement()
  1671. p.framesetOK = false
  1672. p.im = inSelectInTableIM
  1673. return true
  1674. }
  1675. case EndTagToken:
  1676. switch p.tok.DataAtom {
  1677. case a.Td, a.Th:
  1678. if !p.popUntil(tableScope, p.tok.DataAtom) {
  1679. // Ignore the token.
  1680. return true
  1681. }
  1682. p.clearActiveFormattingElements()
  1683. p.im = inRowIM
  1684. return true
  1685. case a.Body, a.Caption, a.Col, a.Colgroup, a.Html:
  1686. // Ignore the token.
  1687. return true
  1688. case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
  1689. if !p.elementInScope(tableScope, p.tok.DataAtom) {
  1690. // Ignore the token.
  1691. return true
  1692. }
  1693. // Close the cell and reprocess.
  1694. if p.popUntil(tableScope, a.Td, a.Th) {
  1695. p.clearActiveFormattingElements()
  1696. }
  1697. p.im = inRowIM
  1698. return false
  1699. }
  1700. }
  1701. return inBodyIM(p)
  1702. }
  1703. // Section 12.2.6.4.16.
  1704. func inSelectIM(p *parser) bool {
  1705. switch p.tok.Type {
  1706. case TextToken:
  1707. p.addText(strings.Replace(p.tok.Data, "\x00", "", -1))
  1708. case StartTagToken:
  1709. switch p.tok.DataAtom {
  1710. case a.Html:
  1711. return inBodyIM(p)
  1712. case a.Option:
  1713. if p.top().DataAtom == a.Option {
  1714. p.oe.pop()
  1715. }
  1716. p.addElement()
  1717. case a.Optgroup:
  1718. if p.top().DataAtom == a.Option {
  1719. p.oe.pop()
  1720. }
  1721. if p.top().DataAtom == a.Optgroup {
  1722. p.oe.pop()
  1723. }
  1724. p.addElement()
  1725. case a.Select:
  1726. if !p.popUntil(selectScope, a.Select) {
  1727. // Ignore the token.
  1728. return true
  1729. }
  1730. p.resetInsertionMode()
  1731. case a.Input, a.Keygen, a.Textarea:
  1732. if p.elementInScope(selectScope, a.Select) {
  1733. p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
  1734. return false
  1735. }
  1736. // In order to properly ignore <textarea>, we need to change the tokenizer mode.
  1737. p.tokenizer.NextIsNotRawText()
  1738. // Ignore the token.
  1739. return true
  1740. case a.Script, a.Template:
  1741. return inHeadIM(p)
  1742. case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp:
  1743. // Don't let the tokenizer go into raw text mode when there are raw tags
  1744. // to be ignored. These tags should be ignored from the tokenizer
  1745. // properly.
  1746. p.tokenizer.NextIsNotRawText()
  1747. // Ignore the token.
  1748. return true
  1749. }
  1750. case EndTagToken:
  1751. switch p.tok.DataAtom {
  1752. case a.Option:
  1753. if p.top().DataAtom == a.Option {
  1754. p.oe.pop()
  1755. }
  1756. case a.Optgroup:
  1757. i := len(p.oe) - 1
  1758. if p.oe[i].DataAtom == a.Option {
  1759. i--
  1760. }
  1761. if p.oe[i].DataAtom == a.Optgroup {
  1762. p.oe = p.oe[:i]
  1763. }
  1764. case a.Select:
  1765. if !p.popUntil(selectScope, a.Select) {
  1766. // Ignore the token.
  1767. return true
  1768. }
  1769. p.resetInsertionMode()
  1770. case a.Template:
  1771. return inHeadIM(p)
  1772. }
  1773. case CommentToken:
  1774. p.addChild(&Node{
  1775. Type: CommentNode,
  1776. Data: p.tok.Data,
  1777. })
  1778. case DoctypeToken:
  1779. // Ignore the token.
  1780. return true
  1781. case ErrorToken:
  1782. return inBodyIM(p)
  1783. }
  1784. return true
  1785. }
  1786. // Section 12.2.6.4.17.
  1787. func inSelectInTableIM(p *parser) bool {
  1788. switch p.tok.Type {
  1789. case StartTagToken, EndTagToken:
  1790. switch p.tok.DataAtom {
  1791. case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th:
  1792. if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) {
  1793. // Ignore the token.
  1794. return true
  1795. }
  1796. // This is like p.popUntil(selectScope, a.Select), but it also
  1797. // matches <math select>, not just <select>. Matching the MathML
  1798. // tag is arguably incorrect (conceptually), but it mimics what
  1799. // Chromium does.
  1800. for i := len(p.oe) - 1; i >= 0; i-- {
  1801. if n := p.oe[i]; n.DataAtom == a.Select {
  1802. p.oe = p.oe[:i]
  1803. break
  1804. }
  1805. }
  1806. p.resetInsertionMode()
  1807. return false
  1808. }
  1809. }
  1810. return inSelectIM(p)
  1811. }
  1812. // Section 12.2.6.4.18.
  1813. func inTemplateIM(p *parser) bool {
  1814. switch p.tok.Type {
  1815. case TextToken, CommentToken, DoctypeToken:
  1816. return inBodyIM(p)
  1817. case StartTagToken:
  1818. switch p.tok.DataAtom {
  1819. case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
  1820. return inHeadIM(p)
  1821. case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead:
  1822. p.templateStack.pop()
  1823. p.templateStack = append(p.templateStack, inTableIM)
  1824. p.im = inTableIM
  1825. return false
  1826. case a.Col:
  1827. p.templateStack.pop()
  1828. p.templateStack = append(p.templateStack, inColumnGroupIM)
  1829. p.im = inColumnGroupIM
  1830. return false
  1831. case a.Tr:
  1832. p.templateStack.pop()
  1833. p.templateStack = append(p.templateStack, inTableBodyIM)
  1834. p.im = inTableBodyIM
  1835. return false
  1836. case a.Td, a.Th:
  1837. p.templateStack.pop()
  1838. p.templateStack = append(p.templateStack, inRowIM)
  1839. p.im = inRowIM
  1840. return false
  1841. default:
  1842. p.templateStack.pop()
  1843. p.templateStack = append(p.templateStack, inBodyIM)
  1844. p.im = inBodyIM
  1845. return false
  1846. }
  1847. case EndTagToken:
  1848. switch p.tok.DataAtom {
  1849. case a.Template:
  1850. return inHeadIM(p)
  1851. default:
  1852. // Ignore the token.
  1853. return true
  1854. }
  1855. case ErrorToken:
  1856. if !p.oe.contains(a.Template) {
  1857. // Ignore the token.
  1858. return true
  1859. }
  1860. // TODO: remove this divergence from the HTML5 spec.
  1861. //
  1862. // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668
  1863. p.generateImpliedEndTags()
  1864. for i := len(p.oe) - 1; i >= 0; i-- {
  1865. if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template {
  1866. p.oe = p.oe[:i]
  1867. break
  1868. }
  1869. }
  1870. p.clearActiveFormattingElements()
  1871. p.templateStack.pop()
  1872. p.resetInsertionMode()
  1873. return false
  1874. }
  1875. return false
  1876. }
  1877. // Section 12.2.6.4.19.
  1878. func afterBodyIM(p *parser) bool {
  1879. switch p.tok.Type {
  1880. case ErrorToken:
  1881. // Stop parsing.
  1882. return true
  1883. case TextToken:
  1884. s := strings.TrimLeft(p.tok.Data, whitespace)
  1885. if len(s) == 0 {
  1886. // It was all whitespace.
  1887. return inBodyIM(p)
  1888. }
  1889. case StartTagToken:
  1890. if p.tok.DataAtom == a.Html {
  1891. return inBodyIM(p)
  1892. }
  1893. case EndTagToken:
  1894. if p.tok.DataAtom == a.Html {
  1895. if !p.fragment {
  1896. p.im = afterAfterBodyIM
  1897. }
  1898. return true
  1899. }
  1900. case CommentToken:
  1901. // The comment is attached to the <html> element.
  1902. if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html {
  1903. panic("html: bad parser state: <html> element not found, in the after-body insertion mode")
  1904. }
  1905. p.oe[0].AppendChild(&Node{
  1906. Type: CommentNode,
  1907. Data: p.tok.Data,
  1908. })
  1909. return true
  1910. }
  1911. p.im = inBodyIM
  1912. return false
  1913. }
  1914. // Section 12.2.6.4.20.
  1915. func inFramesetIM(p *parser) bool {
  1916. switch p.tok.Type {
  1917. case CommentToken:
  1918. p.addChild(&Node{
  1919. Type: CommentNode,
  1920. Data: p.tok.Data,
  1921. })
  1922. case TextToken:
  1923. // Ignore all text but whitespace.
  1924. s := strings.Map(func(c rune) rune {
  1925. switch c {
  1926. case ' ', '\t', '\n', '\f', '\r':
  1927. return c
  1928. }
  1929. return -1
  1930. }, p.tok.Data)
  1931. if s != "" {
  1932. p.addText(s)
  1933. }
  1934. case StartTagToken:
  1935. switch p.tok.DataAtom {
  1936. case a.Html:
  1937. return inBodyIM(p)
  1938. case a.Frameset:
  1939. p.addElement()
  1940. case a.Frame:
  1941. p.addElement()
  1942. p.oe.pop()
  1943. p.acknowledgeSelfClosingTag()
  1944. case a.Noframes:
  1945. return inHeadIM(p)
  1946. }
  1947. case EndTagToken:
  1948. switch p.tok.DataAtom {
  1949. case a.Frameset:
  1950. if p.oe.top().DataAtom != a.Html {
  1951. p.oe.pop()
  1952. if p.oe.top().DataAtom != a.Frameset {
  1953. p.im = afterFramesetIM
  1954. return true
  1955. }
  1956. }
  1957. }
  1958. default:
  1959. // Ignore the token.
  1960. }
  1961. return true
  1962. }
  1963. // Section 12.2.6.4.21.
  1964. func afterFramesetIM(p *parser) bool {
  1965. switch p.tok.Type {
  1966. case CommentToken:
  1967. p.addChild(&Node{
  1968. Type: CommentNode,
  1969. Data: p.tok.Data,
  1970. })
  1971. case TextToken:
  1972. // Ignore all text but whitespace.
  1973. s := strings.Map(func(c rune) rune {
  1974. switch c {
  1975. case ' ', '\t', '\n', '\f', '\r':
  1976. return c
  1977. }
  1978. return -1
  1979. }, p.tok.Data)
  1980. if s != "" {
  1981. p.addText(s)
  1982. }
  1983. case StartTagToken:
  1984. switch p.tok.DataAtom {
  1985. case a.Html:
  1986. return inBodyIM(p)
  1987. case a.Noframes:
  1988. return inHeadIM(p)
  1989. }
  1990. case EndTagToken:
  1991. switch p.tok.DataAtom {
  1992. case a.Html:
  1993. p.im = afterAfterFramesetIM
  1994. return true
  1995. }
  1996. default:
  1997. // Ignore the token.
  1998. }
  1999. return true
  2000. }
  2001. // Section 12.2.6.4.22.
  2002. func afterAfterBodyIM(p *parser) bool {
  2003. switch p.tok.Type {
  2004. case ErrorToken:
  2005. // Stop parsing.
  2006. return true
  2007. case TextToken:
  2008. s := strings.TrimLeft(p.tok.Data, whitespace)
  2009. if len(s) == 0 {
  2010. // It was all whitespace.
  2011. return inBodyIM(p)
  2012. }
  2013. case StartTagToken:
  2014. if p.tok.DataAtom == a.Html {
  2015. return inBodyIM(p)
  2016. }
  2017. case CommentToken:
  2018. p.doc.AppendChild(&Node{
  2019. Type: CommentNode,
  2020. Data: p.tok.Data,
  2021. })
  2022. return true
  2023. case DoctypeToken:
  2024. return inBodyIM(p)
  2025. }
  2026. p.im = inBodyIM
  2027. return false
  2028. }
  2029. // Section 12.2.6.4.23.
  2030. func afterAfterFramesetIM(p *parser) bool {
  2031. switch p.tok.Type {
  2032. case CommentToken:
  2033. p.doc.AppendChild(&Node{
  2034. Type: CommentNode,
  2035. Data: p.tok.Data,
  2036. })
  2037. case TextToken:
  2038. // Ignore all text but whitespace.
  2039. s := strings.Map(func(c rune) rune {
  2040. switch c {
  2041. case ' ', '\t', '\n', '\f', '\r':
  2042. return c
  2043. }
  2044. return -1
  2045. }, p.tok.Data)
  2046. if s != "" {
  2047. p.tok.Data = s
  2048. return inBodyIM(p)
  2049. }
  2050. case StartTagToken:
  2051. switch p.tok.DataAtom {
  2052. case a.Html:
  2053. return inBodyIM(p)
  2054. case a.Noframes:
  2055. return inHeadIM(p)
  2056. }
  2057. case DoctypeToken:
  2058. return inBodyIM(p)
  2059. default:
  2060. // Ignore the token.
  2061. }
  2062. return true
  2063. }
  2064. func ignoreTheRemainingTokens(p *parser) bool {
  2065. return true
  2066. }
  2067. const whitespaceOrNUL = whitespace + "\x00"
  2068. // Section 12.2.6.5
  2069. func parseForeignContent(p *parser) bool {
  2070. switch p.tok.Type {
  2071. case TextToken:
  2072. if p.framesetOK {
  2073. p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == ""
  2074. }
  2075. p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1)
  2076. p.addText(p.tok.Data)
  2077. case CommentToken:
  2078. p.addChild(&Node{
  2079. Type: CommentNode,
  2080. Data: p.tok.Data,
  2081. })
  2082. case StartTagToken:
  2083. if !p.fragment {
  2084. b := breakout[p.tok.Data]
  2085. if p.tok.DataAtom == a.Font {
  2086. loop:
  2087. for _, attr := range p.tok.Attr {
  2088. switch attr.Key {
  2089. case "color", "face", "size":
  2090. b = true
  2091. break loop
  2092. }
  2093. }
  2094. }
  2095. if b {
  2096. for i := len(p.oe) - 1; i >= 0; i-- {
  2097. n := p.oe[i]
  2098. if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) {
  2099. p.oe = p.oe[:i+1]
  2100. break
  2101. }
  2102. }
  2103. return false
  2104. }
  2105. }
  2106. current := p.adjustedCurrentNode()
  2107. switch current.Namespace {
  2108. case "math":
  2109. adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
  2110. case "svg":
  2111. // Adjust SVG tag names. The tokenizer lower-cases tag names, but
  2112. // SVG wants e.g. "foreignObject" with a capital second "O".
  2113. if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
  2114. p.tok.DataAtom = a.Lookup([]byte(x))
  2115. p.tok.Data = x
  2116. }
  2117. adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
  2118. default:
  2119. panic("html: bad parser state: unexpected namespace")
  2120. }
  2121. adjustForeignAttributes(p.tok.Attr)
  2122. namespace := current.Namespace
  2123. p.addElement()
  2124. p.top().Namespace = namespace
  2125. if namespace != "" {
  2126. // Don't let the tokenizer go into raw text mode in foreign content
  2127. // (e.g. in an SVG <title> tag).
  2128. p.tokenizer.NextIsNotRawText()
  2129. }
  2130. if p.hasSelfClosingToken {
  2131. p.oe.pop()
  2132. p.acknowledgeSelfClosingTag()
  2133. }
  2134. case EndTagToken:
  2135. for i := len(p.oe) - 1; i >= 0; i-- {
  2136. if p.oe[i].Namespace == "" {
  2137. return p.im(p)
  2138. }
  2139. if strings.EqualFold(p.oe[i].Data, p.tok.Data) {
  2140. p.oe = p.oe[:i]
  2141. break
  2142. }
  2143. }
  2144. return true
  2145. default:
  2146. // Ignore the token.
  2147. }
  2148. return true
  2149. }
  2150. // Section 12.2.4.2.
  2151. func (p *parser) adjustedCurrentNode() *Node {
  2152. if len(p.oe) == 1 && p.fragment && p.context != nil {
  2153. return p.context
  2154. }
  2155. return p.oe.top()
  2156. }
  2157. // Section 12.2.6.
  2158. func (p *parser) inForeignContent() bool {
  2159. if len(p.oe) == 0 {
  2160. return false
  2161. }
  2162. n := p.adjustedCurrentNode()
  2163. if n.Namespace == "" {
  2164. return false
  2165. }
  2166. if mathMLTextIntegrationPoint(n) {
  2167. if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark {
  2168. return false
  2169. }
  2170. if p.tok.Type == TextToken {
  2171. return false
  2172. }
  2173. }
  2174. if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg {
  2175. return false
  2176. }
  2177. if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) {
  2178. return false
  2179. }
  2180. if p.tok.Type == ErrorToken {
  2181. return false
  2182. }
  2183. return true
  2184. }
  2185. // parseImpliedToken parses a token as though it had appeared in the parser's
  2186. // input.
  2187. func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) {
  2188. realToken, selfClosing := p.tok, p.hasSelfClosingToken
  2189. p.tok = Token{
  2190. Type: t,
  2191. DataAtom: dataAtom,
  2192. Data: data,
  2193. }
  2194. p.hasSelfClosingToken = false
  2195. p.parseCurrentToken()
  2196. p.tok, p.hasSelfClosingToken = realToken, selfClosing
  2197. }
  2198. // parseCurrentToken runs the current token through the parsing routines
  2199. // until it is consumed.
  2200. func (p *parser) parseCurrentToken() {
  2201. if p.tok.Type == SelfClosingTagToken {
  2202. p.hasSelfClosingToken = true
  2203. p.tok.Type = StartTagToken
  2204. }
  2205. consumed := false
  2206. for !consumed {
  2207. if p.inForeignContent() {
  2208. consumed = parseForeignContent(p)
  2209. } else {
  2210. consumed = p.im(p)
  2211. }
  2212. }
  2213. if p.hasSelfClosingToken {
  2214. // This is a parse error, but ignore it.
  2215. p.hasSelfClosingToken = false
  2216. }
  2217. }
  2218. func (p *parser) parse() error {
  2219. // Iterate until EOF. Any other error will cause an early return.
  2220. var err error
  2221. for err != io.EOF {
  2222. // CDATA sections are allowed only in foreign content.
  2223. n := p.oe.top()
  2224. p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
  2225. // Read and parse the next token.
  2226. p.tokenizer.Next()
  2227. p.tok = p.tokenizer.Token()
  2228. if p.tok.Type == ErrorToken {
  2229. err = p.tokenizer.Err()
  2230. if err != nil && err != io.EOF {
  2231. return err
  2232. }
  2233. }
  2234. p.parseCurrentToken()
  2235. }
  2236. return nil
  2237. }
  2238. // Parse returns the parse tree for the HTML from the given Reader.
  2239. //
  2240. // It implements the HTML5 parsing algorithm
  2241. // (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction),
  2242. // which is very complicated. The resultant tree can contain implicitly created
  2243. // nodes that have no explicit <tag> listed in r's data, and nodes' parents can
  2244. // differ from the nesting implied by a naive processing of start and end
  2245. // <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped,
  2246. // with no corresponding node in the resulting tree.
  2247. //
  2248. // The input is assumed to be UTF-8 encoded.
  2249. func Parse(r io.Reader) (*Node, error) {
  2250. return ParseWithOptions(r)
  2251. }
  2252. // ParseFragment parses a fragment of HTML and returns the nodes that were
  2253. // found. If the fragment is the InnerHTML for an existing element, pass that
  2254. // element in context.
  2255. //
  2256. // It has the same intricacies as Parse.
  2257. func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
  2258. return ParseFragmentWithOptions(r, context)
  2259. }
  2260. // ParseOption configures a parser.
  2261. type ParseOption func(p *parser)
  2262. // ParseOptionEnableScripting configures the scripting flag.
  2263. // https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting
  2264. //
  2265. // By default, scripting is enabled.
  2266. func ParseOptionEnableScripting(enable bool) ParseOption {
  2267. return func(p *parser) {
  2268. p.scripting = enable
  2269. }
  2270. }
  2271. // ParseWithOptions is like Parse, with options.
  2272. func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) {
  2273. p := &parser{
  2274. tokenizer: NewTokenizer(r),
  2275. doc: &Node{
  2276. Type: DocumentNode,
  2277. },
  2278. scripting: true,
  2279. framesetOK: true,
  2280. im: initialIM,
  2281. }
  2282. for _, f := range opts {
  2283. f(p)
  2284. }
  2285. if err := p.parse(); err != nil {
  2286. return nil, err
  2287. }
  2288. return p.doc, nil
  2289. }
  2290. // ParseFragmentWithOptions is like ParseFragment, with options.
  2291. func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) {
  2292. contextTag := ""
  2293. if context != nil {
  2294. if context.Type != ElementNode {
  2295. return nil, errors.New("html: ParseFragment of non-element Node")
  2296. }
  2297. // The next check isn't just context.DataAtom.String() == context.Data because
  2298. // it is valid to pass an element whose tag isn't a known atom. For example,
  2299. // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent.
  2300. if context.DataAtom != a.Lookup([]byte(context.Data)) {
  2301. return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
  2302. }
  2303. contextTag = context.DataAtom.String()
  2304. }
  2305. p := &parser{
  2306. doc: &Node{
  2307. Type: DocumentNode,
  2308. },
  2309. scripting: true,
  2310. fragment: true,
  2311. context: context,
  2312. }
  2313. if context != nil && context.Namespace != "" {
  2314. p.tokenizer = NewTokenizer(r)
  2315. } else {
  2316. p.tokenizer = NewTokenizerFragment(r, contextTag)
  2317. }
  2318. for _, f := range opts {
  2319. f(p)
  2320. }
  2321. root := &Node{
  2322. Type: ElementNode,
  2323. DataAtom: a.Html,
  2324. Data: a.Html.String(),
  2325. }
  2326. p.doc.AppendChild(root)
  2327. p.oe = nodeStack{root}
  2328. if context != nil && context.DataAtom == a.Template {
  2329. p.templateStack = append(p.templateStack, inTemplateIM)
  2330. }
  2331. p.resetInsertionMode()
  2332. for n := context; n != nil; n = n.Parent {
  2333. if n.Type == ElementNode && n.DataAtom == a.Form {
  2334. p.form = n
  2335. break
  2336. }
  2337. }
  2338. if err := p.parse(); err != nil {
  2339. return nil, err
  2340. }
  2341. parent := p.doc
  2342. if context != nil {
  2343. parent = root
  2344. }
  2345. var result []*Node
  2346. for c := parent.FirstChild; c != nil; {
  2347. next := c.NextSibling
  2348. parent.RemoveChild(c)
  2349. result = append(result, c)
  2350. c = next
  2351. }
  2352. return result, nil
  2353. }