sax.js 37 KB


  1. // wrapper for non-node envs
  2. ;(function (sax) {
  // Public surface of the module, attached to the export object `sax`.
  // `sax.parser` is the factory form of the constructor for callers that
  // prefer not to use `new`.
  sax.parser = function (strict, opt) {
    return new SAXParser(strict, opt)
  }
  sax.SAXParser = SAXParser
  sax.SAXStream = SAXStream
  sax.createStream = createStream
  // Buffer overrun policy.
  //
  // Checking starts once the parse position passes MAX_BUFFER_LENGTH. Every
  // check schedules the next one MAX_BUFFER_LENGTH - (longest buffer)
  // characters ahead, because that is the earliest point at which any buffer
  // could overrun. Checks are therefore as rare as possible while still being
  // frequent enough to enforce the bound.
  //
  // Buffers are tested at most once per write(), so a single very large
  // string passed to write() may, in the extreme case, be copied once in
  // full. The caller controls chunk size, so this is considered acceptable.
  //
  // Set to Infinity for unlimited buffers.
  sax.MAX_BUFFER_LENGTH = 64 * 1024

  // Names of the parser properties that accumulate text and are subject to
  // the length check above.
  var buffers = [
    "comment",
    "sgmlDecl",
    "textNode",
    "tagName",
    "doctype",
    "procInstName",
    "procInstBody",
    "entity",
    "attribName",
    "attribValue",
    "cdata",
    "script"
  ]

  // Event names, listed for discoverability.
  sax.EVENTS = [
    "text",
    "processinginstruction",
    "sgmldeclaration",
    "doctype",
    "comment",
    "attribute",
    "opentag",
    "closetag",
    "opencdata",
    "cdata",
    "closecdata",
    "error",
    "end",
    "ready",
    "script",
    "opennamespace",
    "closenamespace"
  ]
  /**
   * Parser constructor. May be called with or without `new`.
   *
   * It is also re-invoked on an existing parser (via SAXParser.call) to reset
   * it, so every piece of per-document state is (re)initialised here.
   *
   * @param {*} strict - truthy for strict mode; stored as a boolean.
   * @param {Object} [opt] - options object. It is stored as `parser.opt` and
   *   its `lowercase` property is written back (from `lowercase` or
   *   `lowercasetags`), so the caller's object is modified.
   */
  function SAXParser (strict, opt) {
    if (!(this instanceof SAXParser)) {
      return new SAXParser(strict, opt)
    }

    var self = this
    var options = opt || {}

    clearBuffers(self)
    self.c = ""
    self.q = ""
    self.bufferCheckPosition = sax.MAX_BUFFER_LENGTH

    self.opt = options
    options.lowercase = options.lowercase || options.lowercasetags
    // name of the String method used to fold case in non-strict mode
    self.looseCase = options.lowercase ? "toLowerCase" : "toUpperCase"

    self.tags = []
    self.sawRoot = false
    self.closedRoot = false
    self.closed = false
    self.error = null
    self.tag = null

    self.strict = !!strict
    // strict mode always disables the special <script> handling
    self.noscript = !!(strict || options.noscript)
    self.state = S.BEGIN
    // per-parser entity table that inherits the shared defaults
    self.ENTITIES = Object.create(sax.ENTITIES)
    self.attribList = []

    // Namespaces form a prototype chain: the current tag's bindings
    // inherit from its parent's, and the root inherits from rootNS.
    if (options.xmlns) {
      self.ns = Object.create(rootNS)
    }

    // position tracking is mostly there for error reporting
    self.trackPosition = options.position !== false
    if (self.trackPosition) {
      self.column = 0
      self.line = 0
      self.position = 0
    }

    emit(self, "onready")
  }
  // Minimal fallbacks for engines that lack these ES5 built-ins.
  // They are installed only when the native function is missing.
  if (!Object.create) {
    Object.create = function (proto) {
      function Surrogate () {
        this.__proto__ = proto
      }
      Surrogate.prototype = proto
      return new Surrogate()
    }
  }

  if (!Object.getPrototypeOf) {
    Object.getPrototypeOf = function (obj) {
      return obj.__proto__
    }
  }

  if (!Object.keys) {
    Object.keys = function (obj) {
      var own = []
      for (var key in obj) {
        if (obj.hasOwnProperty(key)) own.push(key)
      }
      return own
    }
  }
  /**
   * Enforce sax.MAX_BUFFER_LENGTH on every parser buffer and schedule the
   * next check by writing `parser.bufferCheckPosition`.
   *
   * Oversized textNode / cdata / script buffers are flushed by emitting
   * their content early; any other oversized buffer is reported through
   * error().
   *
   * Buffer lengths are measured again after the flush pass, so the next
   * check is scheduled from what the buffers actually hold now. Reusing the
   * pre-flush length would schedule the next check earlier than necessary,
   * possibly at a position that has already been passed.
   *
   * @param {SAXParser} parser
   */
  function checkBufferLength (parser) {
    var maxAllowed = Math.max(sax.MAX_BUFFER_LENGTH, 10)
    var count = buffers.length
    var i

    // Pass 1: deal with every buffer that has grown past the limit.
    for (i = 0; i < count; i++) {
      if (parser[buffers[i]].length <= maxAllowed) continue
      // Text/cdata nodes can get big, and since they're buffered, we can
      // get here under normal conditions. Emit what we have now so that
      // at least it won't get any bigger.
      switch (buffers[i]) {
        case "textNode":
          closeText(parser)
          break

        case "cdata":
          emitNode(parser, "oncdata", parser.cdata)
          parser.cdata = ""
          break

        case "script":
          emitNode(parser, "onscript", parser.script)
          parser.script = ""
          break

        default:
          error(parser, "Max buffer length exceeded: " + buffers[i])
      }
    }

    // Pass 2: measure what is left after flushing.
    var maxActual = 0
    for (i = 0; i < count; i++) {
      maxActual = Math.max(maxActual, parser[buffers[i]].length)
    }

    // schedule the next check for the earliest possible buffer overrun.
    parser.bufferCheckPosition = (sax.MAX_BUFFER_LENGTH - maxActual) +
      parser.position
  }
  /**
   * Reset every buffer named in `buffers` to the empty string.
   * @param {SAXParser} parser
   */
  function clearBuffers (parser) {
    buffers.forEach(function (name) {
      parser[name] = ""
    })
  }
  /**
   * Emit whatever is pending in the text, cdata and script buffers, in that
   * order, and clear them. Empty cdata/script buffers emit nothing.
   * @param {SAXParser} parser
   */
  function flushBuffers (parser) {
    closeText(parser)

    var pending = [
      [ "cdata", "oncdata" ],
      [ "script", "onscript" ]
    ]
    for (var k = 0; k < pending.length; k++) {
      var bufferName = pending[k][0]
      if (parser[bufferName] === "") continue
      emitNode(parser, pending[k][1], parser[bufferName])
      parser[bufferName] = ""
    }
  }
  // Instance API. Each method forwards to the module-level function of the
  // same purpose, with the parser as its subject.
  SAXParser.prototype = {
    end: function () {
      end(this)
    },
    write: write,
    // clear a recorded error so that writing can continue
    resume: function () {
      this.error = null
      return this
    },
    // writing null is the signal for end-of-document
    close: function () {
      return this.write(null)
    },
    flush: function () {
      flushBuffers(this)
    }
  }
  // Use node's Stream base class when it can be loaded; otherwise fall back
  // to an empty constructor so that the definitions below still evaluate.
  var Stream
  try {
    Stream = require("stream").Stream
  } catch (ex) {
    Stream = function () {}
  }
  // Events that SAXStream forwards from its parser through generated
  // on<event> accessors. "error" and "end" are excluded because SAXStream
  // wires those two up itself.
  var streamWraps = sax.EVENTS.filter(function (eventName) {
    return !(eventName === "error" || eventName === "end")
  })
  /**
   * Factory for SAXStream.
   * @param {*} strict - passed through to the SAXStream constructor.
   * @param {Object} [opt] - passed through to the SAXStream constructor.
   * @returns {SAXStream}
   */
  function createStream (strict, opt) {
    var stream = new SAXStream(strict, opt)
    return stream
  }
  /**
   * Stream wrapper around a SAXParser. May be called with or without `new`.
   *
   * The parser's "end" and "error" callbacks are re-emitted as stream
   * events. For every name in `streamWraps` an `on<event>` accessor is
   * defined on the instance: reading it returns the parser's handler;
   * assigning a handler subscribes it with `on()`; assigning a falsy value
   * removes all listeners for that event and stores the value on the parser.
   *
   * @param {*} strict - forwarded to SAXParser.
   * @param {Object} [opt] - forwarded to SAXParser.
   */
  function SAXStream (strict, opt) {
    if (!(this instanceof SAXStream)) {
      return new SAXStream(strict, opt)
    }

    Stream.apply(this)

    var me = this
    var parser = new SAXParser(strict, opt)

    me._parser = parser
    me.writable = true
    me.readable = true

    parser.onend = function () {
      me.emit("end")
    }

    parser.onerror = function (er) {
      me.emit("error", er)

      // if didn't throw, then means error was handled.
      // go ahead and clear error, so we can write again.
      me._parser.error = null
    }

    // created on demand by write() when a Buffer arrives
    me._decoder = null

    for (var k = 0; k < streamWraps.length; k++) {
      exposeHandler(streamWraps[k])
    }

    // Define the on<ev> accessor for a single wrapped event.
    function exposeHandler (ev) {
      var slot = "on" + ev
      Object.defineProperty(me, slot, {
        configurable: false,
        enumerable: true,
        get: function () {
          return me._parser[slot]
        },
        set: function (h) {
          if (h) {
            me.on(ev, h)
            return
          }
          me.removeAllListeners(ev)
          me._parser[slot] = h
        }
      })
    }
  }
  // SAXStream inherits from Stream. `constructor` is restored through a
  // property descriptor, so it is non-enumerable.
  SAXStream.prototype = Object.create(Stream.prototype, {
    constructor: {
      value: SAXStream
    }
  })
  /**
   * Feed a chunk to the underlying parser and re-emit it as a "data" event.
   *
   * Buffer chunks are decoded as UTF-8 through a StringDecoder that is
   * created on first use and kept on the stream. Anything else is passed to
   * the parser via toString().
   *
   * @param {Buffer|string|*} data
   * @returns {boolean} always true
   */
  SAXStream.prototype.write = function (data) {
    var bufferAware = typeof Buffer === 'function' &&
      typeof Buffer.isBuffer === 'function'

    if (bufferAware && Buffer.isBuffer(data)) {
      if (!this._decoder) {
        var StringDecoder = require('string_decoder').StringDecoder
        this._decoder = new StringDecoder('utf8')
      }
      data = this._decoder.write(data)
    }

    this._parser.write(data.toString())
    this.emit("data", data)
    return true
  }
  /**
   * Finish the stream: write an optional non-empty final chunk, then end
   * the parser.
   * @param {Buffer|string} [chunk]
   * @returns {boolean} always true
   */
  SAXStream.prototype.end = function (chunk) {
    var hasFinalChunk = Boolean(chunk && chunk.length)
    if (hasFinalChunk) {
      this.write(chunk)
    }
    this._parser.end()
    return true
  }
  /**
   * Subscribe to a stream event.
   *
   * For events listed in `streamWraps`, the first subscription also installs
   * a parser callback that re-emits the parser event on the stream with the
   * same arguments. An existing parser callback is left untouched.
   *
   * @param {string} ev
   * @param {Function} handler
   * @returns {*} whatever Stream.prototype.on returns
   */
  SAXStream.prototype.on = function (ev, handler) {
    var stream = this
    var slot = "on" + ev

    if (!stream._parser[slot] && streamWraps.indexOf(ev) !== -1) {
      stream._parser[slot] = function () {
        var forwarded = Array.prototype.slice.call(arguments)
        forwarded.unshift(ev)
        stream.emit.apply(stream, forwarded)
      }
    }

    return Stream.prototype.on.call(stream, ev, handler)
  }
  // Character classes and tokens.
  //
  // The simple classes start out as strings of member characters and are
  // turned into lookup objects by charClass(). `entity` and `attribEnd` are
  // derived from the raw strings, so they are built before those strings
  // are replaced.
  //
  // The number/letter sets really ought to be replaced with proper
  // character classes: XML allows far more digits and letters than these.
  var whitespace = "\r\n\t "
  var number = "0124356789"
  var letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
  var quote = "'\""
  var entity = charClass(number + letter + "#")
  var attribEnd = charClass(whitespace + ">")

  var CDATA = "[CDATA["
  var DOCTYPE = "DOCTYPE"
  var XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
  var XMLNS_NAMESPACE = "http://www.w3.org/2000/xmlns/"
  // bindings that are always in scope at the root of the namespace chain
  var rootNS = { xml: XML_NAMESPACE, xmlns: XMLNS_NAMESPACE }

  whitespace = charClass(whitespace)
  number = charClass(number)
  letter = charClass(letter)
  quote = charClass(quote)

  // Name characters, after http://www.w3.org/TR/REC-xml/#NT-NameStartChar
  //
  // The parser looks at one string character at a time, so astral-plane
  // characters (10000-EFFFF) cannot be supported without a significant
  // breaking change to either this parser or the JavaScript language.
  var nameStart = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]/
  var nameBody = /[:_A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\u00B7\u0300-\u036F\u203F-\u2040\.\d-]/
  /**
   * Turn a string of characters into a lookup object whose keys are the
   * individual characters, each mapped to `true`.
   * @param {string} str
   * @returns {Object}
   */
  function charClass (str) {
    var members = {}
    for (var i = 0; i < str.length; i++) {
      members[str.charAt(i)] = true
    }
    return members
  }
  /**
   * Report whether `c` is a RegExp, judged by its Object.prototype.toString
   * tag.
   * @param {*} c
   * @returns {boolean}
   */
  function isRegExp (c) {
    var tag = Object.prototype.toString.call(c)
    return tag === '[object RegExp]'
  }
  /**
   * Test whether character `c` belongs to `charclass`.
   * @param {RegExp|Object} charclass - a regular expression, or a lookup
   *   object as produced by charClass().
   * @param {string} c
   * @returns {*} a boolean for regular expressions; for lookup objects the
   *   stored value (`true`, or `undefined` when absent).
   */
  function is (charclass, c) {
    if (isRegExp(charclass)) {
      return c.match(charclass) !== null
    }
    return charclass[c]
  }
  /**
   * Negation of is(): true when `c` does not belong to `charclass`.
   * @param {RegExp|Object} charclass
   * @param {string} c
   * @returns {boolean}
   */
  function not (charclass, c) {
    var member = is(charclass, c)
    return !member
  }
  // Tokenizer states. Each name is numbered in listing order starting at 0;
  // `S` is the running counter and ends up holding the number of states.
  var S = 0
  sax.STATE = [
    "BEGIN",
    "TEXT", // general stuff
    "TEXT_ENTITY", // &amp and such.
    "OPEN_WAKA", // <
    "SGML_DECL", // <!BLARG
    "SGML_DECL_QUOTED", // <!BLARG foo "bar
    "DOCTYPE", // <!DOCTYPE
    "DOCTYPE_QUOTED", // <!DOCTYPE "//blah
    "DOCTYPE_DTD", // <!DOCTYPE "//blah" [ ...
    "DOCTYPE_DTD_QUOTED", // <!DOCTYPE "//blah" [ "foo
    "COMMENT_STARTING", // <!-
    "COMMENT", // <!--
    "COMMENT_ENDING", // <!-- blah -
    "COMMENT_ENDED", // <!-- blah --
    "CDATA", // <![CDATA[ something
    "CDATA_ENDING", // ]
    "CDATA_ENDING_2", // ]]
    "PROC_INST", // <?hi
    "PROC_INST_BODY", // <?hi there
    "PROC_INST_ENDING", // <?hi "there" ?
    "OPEN_TAG", // <strong
    "OPEN_TAG_SLASH", // <strong /
    "ATTRIB", // <a
    "ATTRIB_NAME", // <a foo
    "ATTRIB_NAME_SAW_WHITE", // <a foo _
    "ATTRIB_VALUE", // <a foo=
    "ATTRIB_VALUE_QUOTED", // <a foo="bar
    "ATTRIB_VALUE_CLOSED", // <a foo="bar"
    "ATTRIB_VALUE_UNQUOTED", // <a foo=bar
    "ATTRIB_VALUE_ENTITY_Q", // <foo bar="&quot;"
    "ATTRIB_VALUE_ENTITY_U", // <foo bar=&quot;
    "CLOSE_TAG", // </a
    "CLOSE_TAG_SAW_WHITE", // </a >
    "SCRIPT", // <script> ...
    "SCRIPT_ENDING" // <script> ... <
  ].reduce(function (states, name) {
    states[name] = S++
    return states
  }, {})
  // Default named entities. The first five map directly to their
  // replacement character; every other entry is a numeric character code.
  sax.ENTITIES = {
    "amp": "&", "gt": ">", "lt": "<", "quot": "\"", "apos": "'",

    "AElig": 198, "Aacute": 193, "Acirc": 194, "Agrave": 192, "Aring": 197,
    "Atilde": 195, "Auml": 196, "Ccedil": 199, "ETH": 208, "Eacute": 201,
    "Ecirc": 202, "Egrave": 200, "Euml": 203, "Iacute": 205, "Icirc": 206,
    "Igrave": 204, "Iuml": 207, "Ntilde": 209, "Oacute": 211, "Ocirc": 212,
    "Ograve": 210, "Oslash": 216, "Otilde": 213, "Ouml": 214, "THORN": 222,
    "Uacute": 218, "Ucirc": 219, "Ugrave": 217, "Uuml": 220, "Yacute": 221,

    "aacute": 225, "acirc": 226, "aelig": 230, "agrave": 224, "aring": 229,
    "atilde": 227, "auml": 228, "ccedil": 231, "eacute": 233, "ecirc": 234,
    "egrave": 232, "eth": 240, "euml": 235, "iacute": 237, "icirc": 238,
    "igrave": 236, "iuml": 239, "ntilde": 241, "oacute": 243, "ocirc": 244,
    "ograve": 242, "oslash": 248, "otilde": 245, "ouml": 246, "szlig": 223,
    "thorn": 254, "uacute": 250, "ucirc": 251, "ugrave": 249, "uuml": 252,
    "yacute": 253, "yuml": 255,

    "copy": 169, "reg": 174, "nbsp": 160, "iexcl": 161, "cent": 162,
    "pound": 163, "curren": 164, "yen": 165, "brvbar": 166, "sect": 167,
    "uml": 168, "ordf": 170, "laquo": 171, "not": 172, "shy": 173,
    "macr": 175, "deg": 176, "plusmn": 177, "sup1": 185, "sup2": 178,
    "sup3": 179, "acute": 180, "micro": 181, "para": 182, "middot": 183,
    "cedil": 184, "ordm": 186, "raquo": 187, "frac14": 188, "frac12": 189,
    "frac34": 190, "iquest": 191, "times": 215, "divide": 247,

    "OElig": 338, "oelig": 339, "Scaron": 352, "scaron": 353, "Yuml": 376,
    "fnof": 402, "circ": 710, "tilde": 732,

    "Alpha": 913, "Beta": 914, "Gamma": 915, "Delta": 916, "Epsilon": 917,
    "Zeta": 918, "Eta": 919, "Theta": 920, "Iota": 921, "Kappa": 922,
    "Lambda": 923, "Mu": 924, "Nu": 925, "Xi": 926, "Omicron": 927,
    "Pi": 928, "Rho": 929, "Sigma": 931, "Tau": 932, "Upsilon": 933,
    "Phi": 934, "Chi": 935, "Psi": 936, "Omega": 937,

    "alpha": 945, "beta": 946, "gamma": 947, "delta": 948, "epsilon": 949,
    "zeta": 950, "eta": 951, "theta": 952, "iota": 953, "kappa": 954,
    "lambda": 955, "mu": 956, "nu": 957, "xi": 958, "omicron": 959,
    "pi": 960, "rho": 961, "sigmaf": 962, "sigma": 963, "tau": 964,
    "upsilon": 965, "phi": 966, "chi": 967, "psi": 968, "omega": 969,
    "thetasym": 977, "upsih": 978, "piv": 982,

    "ensp": 8194, "emsp": 8195, "thinsp": 8201, "zwnj": 8204, "zwj": 8205,
    "lrm": 8206, "rlm": 8207, "ndash": 8211, "mdash": 8212, "lsquo": 8216,
    "rsquo": 8217, "sbquo": 8218, "ldquo": 8220, "rdquo": 8221, "bdquo": 8222,
    "dagger": 8224, "Dagger": 8225, "bull": 8226, "hellip": 8230,
    "permil": 8240, "prime": 8242, "Prime": 8243, "lsaquo": 8249,
    "rsaquo": 8250, "oline": 8254, "frasl": 8260, "euro": 8364,

    "image": 8465, "weierp": 8472, "real": 8476, "trade": 8482,
    "alefsym": 8501,

    "larr": 8592, "uarr": 8593, "rarr": 8594, "darr": 8595, "harr": 8596,
    "crarr": 8629, "lArr": 8656, "uArr": 8657, "rArr": 8658, "dArr": 8659,
    "hArr": 8660,

    "forall": 8704, "part": 8706, "exist": 8707, "empty": 8709, "nabla": 8711,
    "isin": 8712, "notin": 8713, "ni": 8715, "prod": 8719, "sum": 8721,
    "minus": 8722, "lowast": 8727, "radic": 8730, "prop": 8733, "infin": 8734,
    "ang": 8736, "and": 8743, "or": 8744, "cap": 8745, "cup": 8746,
    "int": 8747, "there4": 8756, "sim": 8764, "cong": 8773, "asymp": 8776,
    "ne": 8800, "equiv": 8801, "le": 8804, "ge": 8805, "sub": 8834,
    "sup": 8835, "nsub": 8836, "sube": 8838, "supe": 8839, "oplus": 8853,
    "otimes": 8855, "perp": 8869, "sdot": 8901,

    "lceil": 8968, "rceil": 8969, "lfloor": 8970, "rfloor": 8971,
    "lang": 9001, "rang": 9002, "loz": 9674, "spades": 9824, "clubs": 9827,
    "hearts": 9829, "diams": 9830
  }
  // Replace every numeric entry in the entity table with the character it
  // denotes; entries that are already strings are left as they are.
  Object.keys(sax.ENTITIES).forEach(function (name) {
    if (typeof sax.ENTITIES[name] === 'number') {
      sax.ENTITIES[name] = String.fromCharCode(sax.ENTITIES[name])
    }
  })

  // Add the reverse mapping (state number -> state name) to sax.STATE.
  // Iterating a snapshot of the keys keeps the freshly added numeric keys
  // out of the loop.
  var S
  Object.keys(sax.STATE).forEach(function (stateName) {
    sax.STATE[sax.STATE[stateName]] = stateName
  })

  // shorthand
  S = sax.STATE
  /**
   * Invoke the parser's handler for `event`, if one is set, as a method of
   * the parser.
   * @param {SAXParser} parser
   * @param {string} event - handler property name, e.g. "ontext".
   * @param {*} [data] - single argument passed to the handler.
   */
  function emit (parser, event, data) {
    if (parser[event]) {
      parser[event](data)
    }
  }
  /**
   * Emit a node event, first flushing any buffered text so that the text
   * event is delivered before the node event.
   * @param {SAXParser} parser
   * @param {string} nodeType - handler property name, e.g. "onopentag".
   * @param {*} [data]
   */
  function emitNode (parser, nodeType, data) {
    var hasPendingText = Boolean(parser.textNode)
    if (hasPendingText) {
      closeText(parser)
    }
    emit(parser, nodeType, data)
  }
  /**
   * Apply the text options to the buffered text, emit it as "ontext" when
   * anything is left, and empty the buffer.
   * @param {SAXParser} parser
   */
  function closeText (parser) {
    var text = textopts(parser.opt, parser.textNode)
    // keep the processed text on the parser while the handler runs
    parser.textNode = text
    if (text) {
      emit(parser, "ontext", text)
    }
    parser.textNode = ""
  }
  /**
   * Apply the `trim` and `normalize` options to a piece of text.
   * Trimming happens first; normalising collapses each run of whitespace
   * into a single space.
   * @param {Object} opt
   * @param {string} text
   * @returns {string}
   */
  function textopts (opt, text) {
    var result = text
    if (opt.trim) {
      result = result.trim()
    }
    if (opt.normalize) {
      result = result.replace(/\s+/g, " ")
    }
    return result
  }
  /**
   * Record an error on the parser and report it through "onerror".
   * Pending text is flushed first. When position tracking is on, the
   * current line, column and character are appended to the message.
   * @param {SAXParser} parser
   * @param {string} er - error message.
   * @returns {SAXParser} the parser.
   */
  function error (parser, er) {
    closeText(parser)

    var message = er
    if (parser.trackPosition) {
      message = message +
        "\nLine: " + parser.line +
        "\nColumn: " + parser.column +
        "\nChar: " + parser.c
    }

    var failure = new Error(message)
    parser.error = failure
    emit(parser, "onerror", failure)
    return parser
  }
  /**
   * Finish the current document: report an unclosed root (strict mode) or
   * an unexpected end, flush text, emit "onend", then re-run the
   * constructor on the parser so that it can be used again.
   * @param {SAXParser} parser
   * @returns {SAXParser} the parser.
   */
  function end (parser) {
    if (!parser.closedRoot) {
      strictFail(parser, "Unclosed root tag")
    }

    var atRest = parser.state === S.BEGIN || parser.state === S.TEXT
    if (!atRest) {
      error(parser, "Unexpected end")
    }

    closeText(parser)
    parser.c = ""
    parser.closed = true
    emit(parser, "onend")

    // reset all state, keeping the same mode and options
    SAXParser.call(parser, parser.strict, parser.opt)
    return parser
  }
  /**
   * Report `message` as an error, but only when the parser is strict.
   * @param {SAXParser} parser
   * @param {string} message
   * @throws {Error} when `parser` is not a SAXParser instance.
   */
  function strictFail (parser, message) {
    var isParser = typeof parser === 'object' && parser instanceof SAXParser
    if (!isParser) {
      throw new Error('bad call to strictFail')
    }
    if (parser.strict) {
      error(parser, message)
    }
  }
  /**
   * Start a new tag object from the buffered tag name and make it the
   * parser's current tag. Outside strict mode the name is case-folded
   * first. Any attributes collected so far are discarded.
   * @param {SAXParser} parser
   */
  function newTag (parser) {
    if (!parser.strict) {
      parser.tagName = parser.tagName[parser.looseCase]()
    }

    var enclosing = parser.tags[parser.tags.length - 1] || parser
    var fresh = { name: parser.tagName, attributes: {} }
    parser.tag = fresh

    // Start with the parent's bindings; replaced later if the tag carries
    // an xmlns="foo" or xmlns:foo="bar" attribute.
    if (parser.opt.xmlns) {
      fresh.ns = enclosing.ns
    }

    parser.attribList.length = 0
  }
  /**
   * Split a qualified name into prefix and local part.
   *
   * A name without a colon has an empty prefix. Only the first two
   * colon-separated segments are used. As a special case, the attribute
   * named exactly "xmlns" (e.g. <x xmlns="http://foo">) is reported with
   * prefix "xmlns" and an empty local part.
   *
   * @param {string} name
   * @param {*} [attribute] - truthy when `name` is an attribute name.
   * @returns {{prefix: string, local: string}}
   */
  function qname (name, attribute) {
    if (attribute && name === "xmlns") {
      return { prefix: "xmlns", local: "" }
    }

    var segments = name.split(":")
    if (segments.length === 1) {
      // no colon at all: the whole name is the local part
      segments = [ "", name ]
    }

    return { prefix: segments[0], local: segments[1] }
  }
  /**
   * Commit the buffered attribute (parser.attribName / parser.attribValue)
   * to the current tag.
   *
   * A repeated attribute name on the same tag is dropped; the first value
   * wins. In xmlns mode the attribute is queued on parser.attribList as a
   * [name, value] pair and its event is deferred until the whole tag has
   * been read; namespace declarations are bound to the tag immediately.
   * Otherwise the attribute is stored and "onattribute" is emitted at once.
   *
   * Two fixes relative to the earlier implementation:
   *  - duplicates are now detected in xmlns mode too: the queue holds
   *    [name, value] pairs, so searching it for the bare name never matched;
   *  - the own-property test no longer calls a method on the attributes
   *    object, so an attribute literally named "hasOwnProperty" cannot
   *    break parsing of the attributes that follow it.
   *
   * @param {SAXParser} parser
   * @returns {string|undefined} "" when a duplicate was dropped, otherwise
   *   undefined.
   */
  function attrib (parser) {
    if (!parser.strict) parser.attribName = parser.attribName[parser.looseCase]()

    var name = parser.attribName
    var alreadySeen =
      Object.prototype.hasOwnProperty.call(parser.tag.attributes, name) ||
      parser.attribList.some(function (pair) { return pair[0] === name })
    if (alreadySeen) {
      return parser.attribName = parser.attribValue = ""
    }

    if (parser.opt.xmlns) {
      var qn = qname(parser.attribName, true)
      var prefix = qn.prefix
      var local = qn.local

      if (prefix === "xmlns") {
        // namespace binding attribute; push the binding into scope
        if (local === "xml" && parser.attribValue !== XML_NAMESPACE) {
          strictFail(parser,
            "xml: prefix must be bound to " + XML_NAMESPACE + "\n" +
            "Actual: " + parser.attribValue)
        } else if (local === "xmlns" && parser.attribValue !== XMLNS_NAMESPACE) {
          strictFail(parser,
            "xmlns: prefix must be bound to " + XMLNS_NAMESPACE + "\n" +
            "Actual: " + parser.attribValue)
        } else {
          var tag = parser.tag
          var parent = parser.tags[parser.tags.length - 1] || parser
          // give the tag its own scope the first time it declares a binding
          if (tag.ns === parent.ns) {
            tag.ns = Object.create(parent.ns)
          }
          tag.ns[local] = parser.attribValue
        }
      }

      // defer onattribute events until all attributes have been seen
      // so any new bindings can take effect; preserve attribute order
      // so deferred events can be emitted in document order
      parser.attribList.push([parser.attribName, parser.attribValue])
    } else {
      // in non-xmlns mode, we can emit the event right away
      parser.tag.attributes[parser.attribName] = parser.attribValue
      emitNode(parser, "onattribute", {
        name: parser.attribName,
        value: parser.attribValue
      })
    }

    parser.attribName = parser.attribValue = ""
  }
  /**
   * Finish the current open tag and emit "onopentag".
   *
   * In xmlns mode this first resolves the tag's prefix / local name / uri,
   * emits "onopennamespace" for every binding the tag itself introduced,
   * and then emits the attribute events that attrib() deferred, in
   * document order. The default namespace is not applied to attributes
   * (http://www.w3.org/TR/REC-xml-names/#defaulting).
   *
   * @param {SAXParser} parser
   * @param {*} [selfClosing] - truthy for <tag/>; the tag then stays
   *   current and the parser state is left unchanged.
   */
  function openTag (parser, selfClosing) {
    if (parser.opt.xmlns) {
      var tag = parser.tag

      // add namespace info to tag
      var tagQName = qname(parser.tagName)
      tag.prefix = tagQName.prefix
      tag.local = tagQName.local
      tag.uri = tag.ns[tagQName.prefix] || ""

      if (tag.prefix && !tag.uri) {
        strictFail(parser,
          "Unbound namespace prefix: " + JSON.stringify(parser.tagName))
        // outside strict mode, fall back to using the prefix as the uri
        tag.uri = tagQName.prefix
      }

      // emit namespace binding events for bindings declared on this tag
      var enclosing = parser.tags[parser.tags.length - 1] || parser
      if (tag.ns && enclosing.ns !== tag.ns) {
        Object.keys(tag.ns).forEach(function (boundPrefix) {
          emitNode(parser, "onopennamespace", {
            prefix: boundPrefix,
            uri: tag.ns[boundPrefix]
          })
        })
      }

      // handle deferred onattribute events
      var pending = parser.attribList.length
      for (var idx = 0; idx < pending; idx++) {
        var pair = parser.attribList[idx]
        var attrName = pair[0]
        var attrQName = qname(attrName, true)
        var attrPrefix = attrQName.prefix
        var attrUri = attrPrefix === "" ? "" : (tag.ns[attrPrefix] || "")
        var attribute = {
          name: attrName,
          value: pair[1],
          prefix: attrPrefix,
          local: attrQName.local,
          uri: attrUri
        }

        // an attribute with an undefined namespace fails here
        if (attrPrefix && attrPrefix !== "xmlns" && !attrUri) {
          strictFail(parser,
            "Unbound namespace prefix: " + JSON.stringify(attrPrefix))
          attribute.uri = attrPrefix
        }

        parser.tag.attributes[attrName] = attribute
        emitNode(parser, "onattribute", attribute)
      }
      parser.attribList.length = 0
    }

    parser.tag.isSelfClosing = !!selfClosing

    // process the tag
    parser.sawRoot = true
    parser.tags.push(parser.tag)
    emitNode(parser, "onopentag", parser.tag)

    if (!selfClosing) {
      // special case for <script> in non-strict mode.
      var entersScript = !parser.noscript &&
        parser.tagName.toLowerCase() === "script"
      parser.state = entersScript ? S.SCRIPT : S.TEXT
      parser.tag = null
      parser.tagName = ""
    }

    parser.attribValue = ""
    parser.attribName = ""
    parser.attribList.length = 0
  }
  /**
   * Handle a completed close tag whose name is buffered in parser.tagName.
   *
   * - An empty name (</>) is a strict failure and is kept as literal text.
   * - While script content is buffered, only a closing "script" tag ends
   *   it; any other close tag is appended to the script text. The name is
   *   compared case-insensitively, matching the case-insensitive test that
   *   openTag() uses to enter script mode, so </SCRIPT> ends a script that
   *   <SCRIPT> started.
   * - Otherwise the tag stack is searched from the top for a matching open
   *   tag. Every tag above the match is closed as well (a strict failure
   *   for each one skipped). With no match at all the close tag is kept as
   *   literal text.
   *
   * Emits "onclosetag" for each tag popped and, in xmlns mode,
   * "onclosenamespace" for every binding that tag introduced.
   *
   * @param {SAXParser} parser
   */
  function closeTag (parser) {
    if (!parser.tagName) {
      strictFail(parser, "Weird empty close tag.")
      parser.textNode += "</>"
      parser.state = S.TEXT
      return
    }

    if (parser.script) {
      if (parser.tagName.toLowerCase() !== "script") {
        // not the end of the script: keep the tag as script text
        parser.script += "</" + parser.tagName + ">"
        parser.tagName = ""
        parser.state = S.SCRIPT
        return
      }
      emitNode(parser, "onscript", parser.script)
      parser.script = ""
    }

    // first make sure that the closing tag actually exists.
    // <a><b></c></b></a> will close everything, otherwise.
    var t = parser.tags.length
    var tagName = parser.tagName
    if (!parser.strict) tagName = tagName[parser.looseCase]()
    var closeTo = tagName
    while (t--) {
      var close = parser.tags[t]
      if (close.name !== closeTo) {
        // fail the first time in strict mode
        strictFail(parser, "Unexpected close tag")
      } else break
    }

    // didn't find it. we already failed for strict, so just abort.
    if (t < 0) {
      strictFail(parser, "Unmatched closing tag: " + parser.tagName)
      parser.textNode += "</" + parser.tagName + ">"
      parser.state = S.TEXT
      return
    }

    parser.tagName = tagName

    // pop and close everything down to, and including, the matched tag
    var s = parser.tags.length
    while (s-- > t) {
      var tag = parser.tag = parser.tags.pop()
      parser.tagName = parser.tag.name
      emitNode(parser, "onclosetag", parser.tagName)

      var parent = parser.tags[parser.tags.length - 1] || parser
      if (parser.opt.xmlns && tag.ns !== parent.ns) {
        // remove namespace bindings introduced by tag
        Object.keys(tag.ns).forEach(function (p) {
          var n = tag.ns[p]
          emitNode(parser, "onclosenamespace", { prefix: p, uri: n })
        })
      }
    }

    if (t === 0) parser.closedRoot = true
    parser.tagName = parser.attribValue = parser.attribName = ""
    parser.attribList.length = 0
    parser.state = S.TEXT
  }
  /**
   * Resolve the buffered entity name (parser.entity) to its replacement.
   *
   * Named entities are looked up in parser.ENTITIES, first as written and
   * then lower-cased. Otherwise the name must be a numeric character
   * reference ("#65" or "#x41", leading zeros allowed). Anything else is a
   * strict failure and is returned as the literal text "&name;".
   *
   * Fixes relative to the earlier implementation:
   *  - a reference whose digits do not parse (e.g. "#xnan", "#NaN") or that
   *    has no "#" at all (e.g. "000") is rejected; it used to pass the
   *    round-trip check and produce "\u0000";
   *  - function values found in the entity table are ignored, so names such
   *    as "constructor" or "toString" no longer resolve to functions
   *    inherited from Object.prototype;
   *  - code points above 0xFFFF are returned as a surrogate pair instead of
   *    being truncated to 16 bits, and values outside 0..0x10FFFF are
   *    rejected.
   *
   * @param {SAXParser} parser
   * @returns {string} replacement text. A non-function, non-string value
   *   stored in parser.ENTITIES is returned unchanged.
   */
  function parseEntity (parser) {
    var entity = parser.entity
    var entityLC = entity.toLowerCase()
    var num
    var numStr = ""

    // Table lookup that skips function values: the only functions reachable
    // here are methods inherited through the table's prototype chain.
    function named (name) {
      var value = parser.ENTITIES[name]
      return typeof value === "function" ? undefined : value
    }

    if (named(entity)) return named(entity)
    if (named(entityLC)) return named(entityLC)

    entity = entityLC
    if (entity.charAt(0) === "#") {
      if (entity.charAt(1) === "x") {
        entity = entity.slice(2)
        num = parseInt(entity, 16)
        numStr = num.toString(16)
      } else {
        entity = entity.slice(1)
        num = parseInt(entity, 10)
        numStr = num.toString(10)
      }
    }

    // The digits must round-trip exactly (ignoring leading zeros); this
    // rejects trailing garbage such as "#12abc".
    entity = entity.replace(/^0+/, "")
    if (isNaN(num) ||
        numStr.toLowerCase() !== entity ||
        num < 0 ||
        num > 0x10FFFF) {
      strictFail(parser, "Invalid character entity")
      return "&" + parser.entity + ";"
    }

    if (num > 0xFFFF) {
      // encode an astral code point as a UTF-16 surrogate pair
      var offset = num - 0x10000
      return String.fromCharCode(0xD800 + (offset >> 10),
                                 0xDC00 + (offset & 0x3FF))
    }
    return String.fromCharCode(num)
  }
  822. function write (chunk) {
  823. var parser = this
  824. if (this.error) throw this.error
  825. if (parser.closed) return error(parser,
  826. "Cannot write after close. Assign an onready handler.")
  827. if (chunk === null) return end(parser)
  828. var i = 0, c = ""
  829. while (parser.c = c = chunk.charAt(i++)) {
  830. if (parser.trackPosition) {
  831. parser.position ++
  832. if (c === "\n") {
  833. parser.line ++
  834. parser.column = 0
  835. } else parser.column ++
  836. }
  837. switch (parser.state) {
  838. case S.BEGIN:
  839. if (c === "<") {
  840. parser.state = S.OPEN_WAKA
  841. parser.startTagPosition = parser.position
  842. } else if (not(whitespace,c)) {
  843. // have to process this as a text node.
  844. // weird, but happens.
  845. strictFail(parser, "Non-whitespace before first tag.")
  846. parser.textNode = c
  847. parser.state = S.TEXT
  848. }
  849. continue
  850. case S.TEXT:
  851. if (parser.sawRoot && !parser.closedRoot) {
  852. var starti = i-1
  853. while (c && c!=="<" && c!=="&") {
  854. c = chunk.charAt(i++)
  855. if (c && parser.trackPosition) {
  856. parser.position ++
  857. if (c === "\n") {
  858. parser.line ++
  859. parser.column = 0
  860. } else parser.column ++
  861. }
  862. }
  863. parser.textNode += chunk.substring(starti, i-1)
  864. }
  865. if (c === "<") {
  866. parser.state = S.OPEN_WAKA
  867. parser.startTagPosition = parser.position
  868. } else {
  869. if (not(whitespace, c) && (!parser.sawRoot || parser.closedRoot))
  870. strictFail(parser, "Text data outside of root node.")
  871. if (c === "&") parser.state = S.TEXT_ENTITY
  872. else parser.textNode += c
  873. }
  874. continue
  875. case S.SCRIPT:
  876. // only non-strict
  877. if (c === "<") {
  878. parser.state = S.SCRIPT_ENDING
  879. } else parser.script += c
  880. continue
  881. case S.SCRIPT_ENDING:
  882. if (c === "/") {
  883. parser.state = S.CLOSE_TAG
  884. } else {
  885. parser.script += "<" + c
  886. parser.state = S.SCRIPT
  887. }
  888. continue
  889. case S.OPEN_WAKA:
  890. // either a /, ?, !, or text is coming next.
  891. if (c === "!") {
  892. parser.state = S.SGML_DECL
  893. parser.sgmlDecl = ""
  894. } else if (is(whitespace, c)) {
  895. // wait for it...
  896. } else if (is(nameStart,c)) {
  897. parser.state = S.OPEN_TAG
  898. parser.tagName = c
  899. } else if (c === "/") {
  900. parser.state = S.CLOSE_TAG
  901. parser.tagName = ""
  902. } else if (c === "?") {
  903. parser.state = S.PROC_INST
  904. parser.procInstName = parser.procInstBody = ""
  905. } else {
  906. strictFail(parser, "Unencoded <")
  907. // if there was some whitespace, then add that in.
  908. if (parser.startTagPosition + 1 < parser.position) {
  909. var pad = parser.position - parser.startTagPosition
  910. c = new Array(pad).join(" ") + c
  911. }
  912. parser.textNode += "<" + c
  913. parser.state = S.TEXT
  914. }
  915. continue
  916. case S.SGML_DECL:
  917. if ((parser.sgmlDecl+c).toUpperCase() === CDATA) {
  918. emitNode(parser, "onopencdata")
  919. parser.state = S.CDATA
  920. parser.sgmlDecl = ""
  921. parser.cdata = ""
  922. } else if (parser.sgmlDecl+c === "--") {
  923. parser.state = S.COMMENT
  924. parser.comment = ""
  925. parser.sgmlDecl = ""
  926. } else if ((parser.sgmlDecl+c).toUpperCase() === DOCTYPE) {
  927. parser.state = S.DOCTYPE
  928. if (parser.doctype || parser.sawRoot) strictFail(parser,
  929. "Inappropriately located doctype declaration")
  930. parser.doctype = ""
  931. parser.sgmlDecl = ""
  932. } else if (c === ">") {
  933. emitNode(parser, "onsgmldeclaration", parser.sgmlDecl)
  934. parser.sgmlDecl = ""
  935. parser.state = S.TEXT
  936. } else if (is(quote, c)) {
  937. parser.state = S.SGML_DECL_QUOTED
  938. parser.sgmlDecl += c
  939. } else parser.sgmlDecl += c
  940. continue
  941. case S.SGML_DECL_QUOTED:
  942. if (c === parser.q) {
  943. parser.state = S.SGML_DECL
  944. parser.q = ""
  945. }
  946. parser.sgmlDecl += c
  947. continue
  948. case S.DOCTYPE:
  949. if (c === ">") {
  950. parser.state = S.TEXT
  951. emitNode(parser, "ondoctype", parser.doctype)
  952. parser.doctype = true // just remember that we saw it.
  953. } else {
  954. parser.doctype += c
  955. if (c === "[") parser.state = S.DOCTYPE_DTD
  956. else if (is(quote, c)) {
  957. parser.state = S.DOCTYPE_QUOTED
  958. parser.q = c
  959. }
  960. }
  961. continue
  962. case S.DOCTYPE_QUOTED:
  963. parser.doctype += c
  964. if (c === parser.q) {
  965. parser.q = ""
  966. parser.state = S.DOCTYPE
  967. }
  968. continue
  969. case S.DOCTYPE_DTD:
  970. parser.doctype += c
  971. if (c === "]") parser.state = S.DOCTYPE
  972. else if (is(quote,c)) {
  973. parser.state = S.DOCTYPE_DTD_QUOTED
  974. parser.q = c
  975. }
  976. continue
  977. case S.DOCTYPE_DTD_QUOTED:
  978. parser.doctype += c
  979. if (c === parser.q) {
  980. parser.state = S.DOCTYPE_DTD
  981. parser.q = ""
  982. }
  983. continue
  984. case S.COMMENT:
  985. if (c === "-") parser.state = S.COMMENT_ENDING
  986. else parser.comment += c
  987. continue
  988. case S.COMMENT_ENDING:
  989. if (c === "-") {
  990. parser.state = S.COMMENT_ENDED
  991. parser.comment = textopts(parser.opt, parser.comment)
  992. if (parser.comment) emitNode(parser, "oncomment", parser.comment)
  993. parser.comment = ""
  994. } else {
  995. parser.comment += "-" + c
  996. parser.state = S.COMMENT
  997. }
  998. continue
  999. case S.COMMENT_ENDED:
  1000. if (c !== ">") {
  1001. strictFail(parser, "Malformed comment")
  1002. // allow <!-- blah -- bloo --> in non-strict mode,
  1003. // which is a comment of " blah -- bloo "
  1004. parser.comment += "--" + c
  1005. parser.state = S.COMMENT
  1006. } else parser.state = S.TEXT
  1007. continue
  1008. case S.CDATA:
  1009. if (c === "]") parser.state = S.CDATA_ENDING
  1010. else parser.cdata += c
  1011. continue
  1012. case S.CDATA_ENDING:
  1013. if (c === "]") parser.state = S.CDATA_ENDING_2
  1014. else {
  1015. parser.cdata += "]" + c
  1016. parser.state = S.CDATA
  1017. }
  1018. continue
  1019. case S.CDATA_ENDING_2:
  1020. if (c === ">") {
  1021. if (parser.cdata) emitNode(parser, "oncdata", parser.cdata)
  1022. emitNode(parser, "onclosecdata")
  1023. parser.cdata = ""
  1024. parser.state = S.TEXT
  1025. } else if (c === "]") {
  1026. parser.cdata += "]"
  1027. } else {
  1028. parser.cdata += "]]" + c
  1029. parser.state = S.CDATA
  1030. }
  1031. continue
  1032. case S.PROC_INST:
  1033. if (c === "?") parser.state = S.PROC_INST_ENDING
  1034. else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY
  1035. else parser.procInstName += c
  1036. continue
  1037. case S.PROC_INST_BODY:
  1038. if (!parser.procInstBody && is(whitespace, c)) continue
  1039. else if (c === "?") parser.state = S.PROC_INST_ENDING
  1040. else parser.procInstBody += c
  1041. continue
  1042. case S.PROC_INST_ENDING:
  1043. if (c === ">") {
  1044. emitNode(parser, "onprocessinginstruction", {
  1045. name : parser.procInstName,
  1046. body : parser.procInstBody
  1047. })
  1048. parser.procInstName = parser.procInstBody = ""
  1049. parser.state = S.TEXT
  1050. } else {
  1051. parser.procInstBody += "?" + c
  1052. parser.state = S.PROC_INST_BODY
  1053. }
  1054. continue
  1055. case S.OPEN_TAG:
  1056. if (is(nameBody, c)) parser.tagName += c
  1057. else {
  1058. newTag(parser)
  1059. if (c === ">") openTag(parser)
  1060. else if (c === "/") parser.state = S.OPEN_TAG_SLASH
  1061. else {
  1062. if (not(whitespace, c)) strictFail(
  1063. parser, "Invalid character in tag name")
  1064. parser.state = S.ATTRIB
  1065. }
  1066. }
  1067. continue
  1068. case S.OPEN_TAG_SLASH:
  1069. if (c === ">") {
  1070. openTag(parser, true)
  1071. closeTag(parser)
  1072. } else {
  1073. strictFail(parser, "Forward-slash in opening tag not followed by >")
  1074. parser.state = S.ATTRIB
  1075. }
  1076. continue
  1077. case S.ATTRIB:
  1078. // haven't read the attribute name yet.
  1079. if (is(whitespace, c)) continue
  1080. else if (c === ">") openTag(parser)
  1081. else if (c === "/") parser.state = S.OPEN_TAG_SLASH
  1082. else if (is(nameStart, c)) {
  1083. parser.attribName = c
  1084. parser.attribValue = ""
  1085. parser.state = S.ATTRIB_NAME
  1086. } else strictFail(parser, "Invalid attribute name")
  1087. continue
  1088. case S.ATTRIB_NAME:
  1089. if (c === "=") parser.state = S.ATTRIB_VALUE
  1090. else if (c === ">") {
  1091. strictFail(parser, "Attribute without value")
  1092. parser.attribValue = parser.attribName
  1093. attrib(parser)
  1094. openTag(parser)
  1095. }
  1096. else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE
  1097. else if (is(nameBody, c)) parser.attribName += c
  1098. else strictFail(parser, "Invalid attribute name")
  1099. continue
  1100. case S.ATTRIB_NAME_SAW_WHITE:
  1101. if (c === "=") parser.state = S.ATTRIB_VALUE
  1102. else if (is(whitespace, c)) continue
  1103. else {
  1104. strictFail(parser, "Attribute without value")
  1105. parser.tag.attributes[parser.attribName] = ""
  1106. parser.attribValue = ""
  1107. emitNode(parser, "onattribute",
  1108. { name : parser.attribName, value : "" })
  1109. parser.attribName = ""
  1110. if (c === ">") openTag(parser)
  1111. else if (is(nameStart, c)) {
  1112. parser.attribName = c
  1113. parser.state = S.ATTRIB_NAME
  1114. } else {
  1115. strictFail(parser, "Invalid attribute name")
  1116. parser.state = S.ATTRIB
  1117. }
  1118. }
  1119. continue
  1120. case S.ATTRIB_VALUE:
  1121. if (is(whitespace, c)) continue
  1122. else if (is(quote, c)) {
  1123. parser.q = c
  1124. parser.state = S.ATTRIB_VALUE_QUOTED
  1125. } else {
  1126. strictFail(parser, "Unquoted attribute value")
  1127. parser.state = S.ATTRIB_VALUE_UNQUOTED
  1128. parser.attribValue = c
  1129. }
  1130. continue
  1131. case S.ATTRIB_VALUE_QUOTED:
  1132. if (c !== parser.q) {
  1133. if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q
  1134. else parser.attribValue += c
  1135. continue
  1136. }
  1137. attrib(parser)
  1138. parser.q = ""
  1139. parser.state = S.ATTRIB_VALUE_CLOSED
  1140. continue
  1141. case S.ATTRIB_VALUE_CLOSED:
  1142. if (is(whitespace, c)) {
  1143. parser.state = S.ATTRIB
  1144. } else if (c === ">") openTag(parser)
  1145. else if (c === "/") parser.state = S.OPEN_TAG_SLASH
  1146. else if (is(nameStart, c)) {
  1147. strictFail(parser, "No whitespace between attributes")
  1148. parser.attribName = c
  1149. parser.attribValue = ""
  1150. parser.state = S.ATTRIB_NAME
  1151. } else strictFail(parser, "Invalid attribute name")
  1152. continue
  1153. case S.ATTRIB_VALUE_UNQUOTED:
  1154. if (not(attribEnd,c)) {
  1155. if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U
  1156. else parser.attribValue += c
  1157. continue
  1158. }
  1159. attrib(parser)
  1160. if (c === ">") openTag(parser)
  1161. else parser.state = S.ATTRIB
  1162. continue
  1163. case S.CLOSE_TAG:
  1164. if (!parser.tagName) {
  1165. if (is(whitespace, c)) continue
  1166. else if (not(nameStart, c)) {
  1167. if (parser.script) {
  1168. parser.script += "</" + c
  1169. parser.state = S.SCRIPT
  1170. } else {
  1171. strictFail(parser, "Invalid tagname in closing tag.")
  1172. }
  1173. } else parser.tagName = c
  1174. }
  1175. else if (c === ">") closeTag(parser)
  1176. else if (is(nameBody, c)) parser.tagName += c
  1177. else if (parser.script) {
  1178. parser.script += "</" + parser.tagName
  1179. parser.tagName = ""
  1180. parser.state = S.SCRIPT
  1181. } else {
  1182. if (not(whitespace, c)) strictFail(parser,
  1183. "Invalid tagname in closing tag")
  1184. parser.state = S.CLOSE_TAG_SAW_WHITE
  1185. }
  1186. continue
  1187. case S.CLOSE_TAG_SAW_WHITE:
  1188. if (is(whitespace, c)) continue
  1189. if (c === ">") closeTag(parser)
  1190. else strictFail(parser, "Invalid characters in closing tag")
  1191. continue
  1192. case S.TEXT_ENTITY:
  1193. case S.ATTRIB_VALUE_ENTITY_Q:
  1194. case S.ATTRIB_VALUE_ENTITY_U:
  1195. switch(parser.state) {
  1196. case S.TEXT_ENTITY:
  1197. var returnState = S.TEXT, buffer = "textNode"
  1198. break
  1199. case S.ATTRIB_VALUE_ENTITY_Q:
  1200. var returnState = S.ATTRIB_VALUE_QUOTED, buffer = "attribValue"
  1201. break
  1202. case S.ATTRIB_VALUE_ENTITY_U:
  1203. var returnState = S.ATTRIB_VALUE_UNQUOTED, buffer = "attribValue"
  1204. break
  1205. }
  1206. if (c === ";") {
  1207. parser[buffer] += parseEntity(parser)
  1208. parser.entity = ""
  1209. parser.state = returnState
  1210. }
  1211. else if (is(entity, c)) parser.entity += c
  1212. else {
  1213. strictFail(parser, "Invalid character entity")
  1214. parser[buffer] += "&" + parser.entity + c
  1215. parser.entity = ""
  1216. parser.state = returnState
  1217. }
  1218. continue
  1219. default:
  1220. throw new Error(parser, "Unknown state: " + parser.state)
  1221. }
  1222. } // while
  1223. // Disabled: cdata blocks can get very big under normal conditions, so this would emit the partial cdata early and move on.
  1224. // if (parser.state === S.CDATA && parser.cdata) {
  1225. // emitNode(parser, "oncdata", parser.cdata)
  1226. // parser.cdata = ""
  1227. // }
  1228. if (parser.position >= parser.bufferCheckPosition) checkBufferLength(parser)
  1229. return parser
  1230. }
  1231. })(typeof exports === "undefined" ? sax = {} : exports)