# Copyright 2005-2007 Kevin Reid, under the terms of the MIT X license
# found at http://www.opensource.org/licenses/mit-license.html ................

pragma.syntax("0.9")

# NOTE(review): the right-hand sides of the next two definitions are absent
# from this copy of the file (possibly import expressions that were stripped
# during extraction). They are left exactly as found; restore the original
# expressions before using this file. TODO confirm against the upstream source.
def transformIn :DeepFrozen :=
def getCharacter :DeepFrozen :=

# Well-formed UTF-8 byte sequences, keyed by the range of the first octet.
# Unicode Standard, Version 4.0, Chapter 3, Section 3.10, page 78
def utf8DecodeRanges :DeepFrozen := [
  # range,    byte mask, following, second range
  [0x00..0x7F, 0x7F,     0,         0..!0],
  [0xC2..0xDF, 0x1F,     1,         0x80..0xBF],
  [0xE0..0xE0, 0x0F,     2,         0xA0..0xBF],
  [0xE1..0xEC, 0x0F,     2,         0x80..0xBF],
  [0xED..0xED, 0x0F,     2,         0x80..0x9F],
  [0xEE..0xEF, 0x0F,     2,         0x80..0xBF],
  [0xF0..0xF0, 0x07,     3,         0x90..0xBF],
  [0xF1..0xF3, 0x07,     3,         0x80..0xBF],
  [0xF4..0xF4, 0x07,     3,         0x80..0x8F]]

# Map from each permitted first octet to its [byte mask, following, second
# range] triple, expanded from utf8DecodeRanges so the decoder needs a single
# lookup per leading octet. Octets that begin no well-formed sequence
# (0x80..0xC1, 0xF5..0xFF) have no entry.
def utf8DecodeTable :DeepFrozen := {
  def t := [].asMap().diverge(int, any)
  for [range] + info in utf8DecodeRanges {
    for octett in range {
      t[octett] := info
    }
  }
  t.snapshot()
}

# Range every continuation octet after the second must fall in.
def anyContinuation :DeepFrozen := 0x80..0xBF

# These would really be better inline; moving them to outer level is
# optimization. XXX Improve the compiler etc. to be able to hoist/constant fold
# these, and put them back at their usage sites
def EncodedBy1 :DeepFrozen := 0..0x7f
def EncodedBy2 :DeepFrozen := 0x80..0x7FF
def EncodedBy3 :DeepFrozen := 0x800..0xFFFF
def EncodedBy4 :DeepFrozen := 0x10000..0x10FFFF

# Surrogate code points. They lie inside EncodedBy3 but have no well-formed
# UTF-8 encoding; the decode table above likewise rejects the octet sequences
# (0xED followed by 0xA0..0xBF) that would represent them.
def Surrogates :DeepFrozen := 0xD800..0xDFFF

# UTF-8 codec. Both methods accept only the empty map as their second argument
# and hand a per-chunk converter object to transformIn.
def ::"UTF-8" implements DeepFrozen, ExitViaHere { # implements CharacterEncoding

  # Wraps octetStream via transformIn(String, ...) with a converter that turns
  # lists of octets into text.
  # Throws "invalid UTF-8 sequence" for an octet that cannot start a sequence,
  # and "bad continuation" / "out-of-range continuation" for a bad following
  # octet.
  to decode(octetStream, ==([].asMap())) {
    # Decoder state is declared outside run/1 so that it persists from one
    # run/1 call to the next; a sequence left unfinished at the end of one
    # chunk is completed by the octets of the next.
    # NOTE(review): nothing visible here reports a sequence that is still
    # incomplete when the stream ends - confirm whether transformIn offers a
    # hook for that.
    var bits :nullOk[int] := null      # code point bits accumulated so far
    var expect :nullOk[int] := null    # continuation octets still due; null between characters
    var octetRange := 0..0xFF          # range the next continuation octet must be in

    return transformIn(String, def utf8ChunkDecoder {
      to __printOn(tw :TextWriter) { tw.write("from UTF-8") }

      # Decodes one chunk of octets and returns the characters completed in it.
      to run(inn) {
        # def outEstimate := inn.size() // 4
        # XXX once available, this should instead be makeFlexList.(char, outEstimate)
        def out := [].diverge(char)
        for octett in inn {
          if (expect == null) {
            # First octet of a sequence: look up its payload mask, how many
            # continuation octets follow, and the restricted range of the
            # second octet.
            def [firstOctetMask, newExpect, secondOctetRange] :=
              utf8DecodeTable.fetch(octett, fn {throw("invalid UTF-8 sequence")})
            bits := octett & firstOctetMask
            expect := newExpect
            octetRange := secondOctetRange
          } else {
            require((octett & 0xC0) == 0x80, "bad continuation")
            require(octetRange(octett), "out-of-range continuation")
            bits := (bits << 6) + (octett & 0x3F)
            # Only the second octet has a restricted range; later ones may be
            # any continuation octet.
            octetRange := anyContinuation
            expect -= 1
          }
          if (expect == 0) {
            # Sequence complete (immediately so for single-octet sequences).
            out.push(getCharacter(bits))
            expect := null
            bits := null
            octetRange := anyContinuation
          }
        }
        return __makeTwine.fromValuesOf(out)
      }
    }, octetStream)
  }

  # Wraps charStream via transformIn(List[int], ...) with a converter that
  # turns characters into lists of UTF-8 octets.
  # Throws if a character's code point is a surrogate (U+D800..U+DFFF); a code
  # point matching none of the EncodedBy ranges falls out of the switch with no
  # matching case.
  to encode(charStream, ==([].asMap())) {
    return transformIn(List[int], def utf8ChunkEncoder {
      to __printOn(tw :TextWriter) { tw.write("to UTF-8") }

      # Encodes one chunk of characters and returns the resulting octets.
      to run(inn) {
        # def outEstimate := inn.size()
        # XXX once available, this should instead be makeFlexList.(int, outEstimate)
        def out := [].diverge(int) # XXX int8
        for ch in inn {
          switch (ch.getCodepoint()) {
            match one :EncodedBy1 {
              out.push(one)
            }
            match two :EncodedBy2 {
              out.push(0xC0 | ( two >> 6        ))
              out.push(0x80 | ( two        & 0x3F))
            }
            match three :EncodedBy3 {
              # Refuse surrogates: encoding them would emit octets that the
              # decoder in this file rejects as "out-of-range continuation".
              require(!(Surrogates(three)),
                      "cannot encode surrogate code point as UTF-8")
              out.push(0xE0 | ( three >> 12       ))
              out.push(0x80 | ((three >> 6 ) & 0x3F))
              out.push(0x80 | ( three        & 0x3F))
            }
            match four :EncodedBy4 {
              out.push(0xF0 | ( four >> 18       ))
              out.push(0x80 | ((four >> 12) & 0x3F))
              out.push(0x80 | ((four >> 6 ) & 0x3F))
              out.push(0x80 | ( four        & 0x3F))
            }
          }
        }
        return out.snapshot()
      }
    }, charStream)
  }
}