103 lines
2.4 KiB
JavaScript

/**
* https://encoding.spec.whatwg.org/#utf-8-decoder
*/
module.exports = class UTF8Decoder {
constructor () {
this.codePoint = 0
this.bytesSeen = 0
this.bytesNeeded = 0
this.lowerBoundary = 0x80
this.upperBoundary = 0xbf
}
get remaining () {
return this.bytesSeen
}
decode (data) {
// If we have a fast path, just sniff if the last part is a boundary
if (this.bytesNeeded === 0) {
let isBoundary = true
for (let i = Math.max(0, data.byteLength - 4), n = data.byteLength; i < n && isBoundary; i++) {
isBoundary = data[i] <= 0x7f
}
if (isBoundary) return data.toString()
}
let result = ''
for (let i = 0, n = data.byteLength; i < n; i++) {
const byte = data[i]
if (this.bytesNeeded === 0) {
if (byte <= 0x7f) {
result += String.fromCharCode(byte)
} else {
this.bytesSeen = 1
if (byte >= 0xc2 && byte <= 0xdf) {
this.bytesNeeded = 2
this.codePoint = byte & 0x1f
} else if (byte >= 0xe0 && byte <= 0xef) {
if (byte === 0xe0) this.lowerBoundary = 0xa0
else if (byte === 0xed) this.upperBoundary = 0x9f
this.bytesNeeded = 3
this.codePoint = byte & 0xf
} else if (byte >= 0xf0 && byte <= 0xf4) {
if (byte === 0xf0) this.lowerBoundary = 0x90
if (byte === 0xf4) this.upperBoundary = 0x8f
this.bytesNeeded = 4
this.codePoint = byte & 0x7
} else {
result += '\ufffd'
}
}
continue
}
if (byte < this.lowerBoundary || byte > this.upperBoundary) {
this.codePoint = 0
this.bytesNeeded = 0
this.bytesSeen = 0
this.lowerBoundary = 0x80
this.upperBoundary = 0xbf
result += '\ufffd'
continue
}
this.lowerBoundary = 0x80
this.upperBoundary = 0xbf
this.codePoint = (this.codePoint << 6) | (byte & 0x3f)
this.bytesSeen++
if (this.bytesSeen !== this.bytesNeeded) continue
result += String.fromCodePoint(this.codePoint)
this.codePoint = 0
this.bytesNeeded = 0
this.bytesSeen = 0
}
return result
}
flush () {
const result = this.bytesNeeded > 0 ? '\ufffd' : ''
this.codePoint = 0
this.bytesNeeded = 0
this.bytesSeen = 0
this.lowerBoundary = 0x80
this.upperBoundary = 0xbf
return result
}
}