# Copyright 2005 Kevin Reid, under the terms of the MIT X license
# found at http://www.opensource.org/licenses/mit-license.html ................

pragma.enable("easy-return")
pragma.disable("explicit-result-guard")

# NOTE(review): the right-hand sides of the next two definitions are absent
# from this copy of the file (they appear to have been lost in extraction).
# The tokens are preserved exactly as found; restore the original expressions
# before using this file.
def mapStream :=
def getCharacter :=

# Unicode Standard, Version 4.0, Chapter 3, Section 3.10, page 78
#
# One row per class of lead octet. After the lead-octet range, each row gives
# the mask selecting the payload bits of the lead octet, the number of octets
# that follow it, and the range permitted for the first following octet.
def utf8DecodeRanges := [
  # range, byte mask, following, second range
  [0x00..0x7F, 0x7F, 0, 0..!0],
  [0xC2..0xDF, 0x1F, 1, 0x80..0xBF],
  [0xE0..0xE0, 0x0F, 2, 0xA0..0xBF],
  [0xE1..0xEC, 0x0F, 2, 0x80..0xBF],
  [0xED..0xED, 0x0F, 2, 0x80..0x9F],
  [0xEE..0xEF, 0x0F, 2, 0x80..0xBF],
  [0xF0..0xF0, 0x07, 3, 0x90..0xBF],
  [0xF1..0xF3, 0x07, 3, 0x80..0xBF],
  [0xF4..0xF4, 0x07, 3, 0x80..0x8F]]

# Map from each permitted lead octet to its [mask, following, second range]
# triple, built by expanding every range in utf8DecodeRanges. Octets that
# appear in no range have no entry.
def utf8DecodeTable := {
  def t := [].asMap().diverge(int, any)
  for [range] + info in utf8DecodeRanges {
    for octett in range {
      t[octett] := info
    }
  }
  t.snapshot()
}

def "UTF-8" { # implements CharacterEncoding, DeepFrozen

  # Decode a stream of octets into characters.
  #
  # octetStream is handed to mapStream.chunked together with the chunk
  # decoder below. The second parameter must be the empty map.
  #
  # Throws "invalid UTF-8 sequence" for an octet that cannot start a
  # sequence, and "bad continuation" / "out-of-range continuation" for a
  # following octet that is not acceptable at its position.
  #
  # NOTE(review): nothing visible here reports a sequence that is still
  # incomplete when the input ends -- confirm whether mapStream.chunked or the
  # caller is expected to detect that.
  to decode(octetStream, ==([].asMap())) {
    # Decoder state lives outside run/1 so that it persists between calls;
    # a multi-octet sequence may therefore be split across two chunks.
    #   bits       -- payload bits accumulated so far (null between characters)
    #   expect     -- following octets still required (null between characters)
    #   octetRange -- range the next following octet must fall in
    var bits :nullOk[int] := null
    var expect :nullOk[int] := null
    var octetRange := 0..0xFF

    return mapStream.chunked(char, octetStream, def utf8ChunkDecoder {
      to __printOn(tw :TextWriter) { tw.write("from UTF-8") }

      # Decode one chunk of octets and return the completed characters.
      to run(inn) {
        # def outEstimate := inn.size() // 4
        # XXX once available, this should instead be makeFlexList.(char, outEstimate)
        def out := [].diverge(char)

        for octett in inn {
          if (expect == null) {
            # Start of a character: look up the lead octet.
            def [firstOctetMask, newExpect, secondOctetRange] := utf8DecodeTable.fetch(octett, thunk {throw("invalid UTF-8 sequence")})
            bits := octett & firstOctetMask
            expect := newExpect
            octetRange := secondOctetRange
          } else {
            require((octett & 0xC0) == 0x80, "bad continuation")
            require(octetRange(octett), "out-of-range continuation")
            bits := (bits << 6) + (octett & 0x3F)
            # Only the first following octet has a restricted range; the
            # rest accept any continuation octet.
            octetRange := 0x80..0xBF
            expect -= 1
          }

          if (expect == 0) {
            # Character complete: emit it and reset for the next lead octet.
            out.push(getCharacter(bits))
            expect := null
            bits := null
            octetRange := 0x80..0xBF
          }
        }

        return __makeTwine.fromSequence(out)
      }
    })
  }

  # Encode a stream of characters into UTF-8 octets.
  #
  # charStream is handed to mapStream.chunked together with the chunk
  # encoder below. The second parameter must be the empty map.
  #
  # Throws for a surrogate code point (0xD800..0xDFFF): its three-octet form
  # would begin ED A0..BF, which the decode table in this file does not
  # accept, so emitting it would produce output that decode/2 refuses.
  #
  # NOTE(review): the first argument to mapStream.chunked is `char` here even
  # though the chunks produced are lists of ints; this may be a copy from
  # decode/2 -- confirm against mapStream.chunked's contract.
  to encode(charStream, ==([].asMap())) {
    return mapStream.chunked(char, charStream, def utf8ChunkEncoder {
      to __printOn(tw :TextWriter) { tw.write("to UTF-8") }

      # Encode one chunk of characters and return the resulting octets.
      to run(inn) {
        # def outEstimate := inn.size()
        # XXX once available, this should instead be makeFlexList.(int, outEstimate)
        def out := [].diverge(int) # XXX int8

        for ch in inn {
          switch (ch.getCodepoint()) {
            match one :0..0x7F \
              { out.push(one) }
            match two :0x80..0x7FF \
              { out.push(0xC0 | ( two >> 6         ))
                out.push(0x80 | ( two        & 0x3F)) }
            # Must precede the three-octet case, whose range also covers
            # these code points.
            match surrogate :0xD800..0xDFFF \
              { throw("cannot encode surrogate code point as UTF-8") }
            match three :0x800..0xFFFF \
              { out.push(0xE0 | ( three >> 12        ))
                out.push(0x80 | ((three >> 6 ) & 0x3F))
                out.push(0x80 | ( three        & 0x3F)) }
            match four :0x10000..0x10FFFF \
              { out.push(0xF0 | ( four >> 18        ))
                out.push(0x80 | ((four >> 12) & 0x3F))
                out.push(0x80 | ((four >> 6 ) & 0x3F))
                out.push(0x80 | ( four        & 0x3F)) }
          }
        }

        return out.snapshot()
      }
    })
  }
}