General Update

An update on the current progress of projects and general things here at KRT. I’ve set about checking out TypeScript for using in projects. It looks good, has some hidden pitfalls on findingĀ .m.ts files for underscore for example, but in general looks good. I’m running it over some JS to get more of a feel. The audio VST project is moving slowly, at oscillators at the moment, with filters being done. I am looking into cache coherence algorithms and strategies to ease hardware design at the moment too. The 68k2 document mentioned in previous post is expanding with some of these ideas in having a “stall on value match” register, with a “touch since changed” bit in each cache line.

All good.

The Processor Design Document in Progress

TypeScript

Well I eventually managed to get a file using _.reduce() to compile without errors now. I’ll test it as soon as I’ve adapted in QUnit 2.0.1 so I can write my tests to the build as a pop up window, an perhaps back load a file to then be able to save the file from within the editor, and hence to become parser frame.

Representation

An excerpt from the 68k2 document as it’s progressing. An idea on UTF8 easy indexing and expansion.

“Reducing the size of this indexing array can recursively use the same technique, as long as movement between length encodings is not traversed for long sequences. This would require adding in a 2 length (11 bit form) and a 3 length (16 bit form) of common punctuation and spacing. Surrogate pair just postpones the issue and moves cache occupation to 25%, and not quite that for speed efficiency. This is why the simplified Chinese is common circa 2017, and surrogate processing has been abandoned in the Unicode specification, and replaced by characters in the surrogate representation space. Hand drawing the surrogates was likely the issue, and character parts (as individual parts) with double strike was considered a better rendering option.

UTF8 therefore has a possible 17 bit rendering for due to the extra bit freed by not needing a UTF32 representation. Should this be glyph space, or skip code index space, or a mix? 16 bit purity says skip code space. With common length (2 bit) and count (14 bit), allowing skips of between 16 kB and 48 kB through a document. The 4th combination of length? Perhaps the representation of the common punctuation without character length alterations. For 512 specials in the 2 length form and 65536 specials in the 3 length forms. In UTF16 there would be issues of decode, and uniqueness. This perhaps is best tackled by some render form meta characters in the original Unicode space. There is no way around it, and with skips maybe UTF8 would be faster.”


// tool.js 1.1.1
// https://kring.co.uk
// (c) 2016-2017 Simon Jackson, K Ring Technologies Ltd
// MIT, like as he said. And underscored :D

import * as _ from 'underscore';

//==============================================================================
// LZW-compress a string
//==============================================================================
// The bounce parameter if true adds extra entries for faster dictionary growth.
// Usually LZW dictionary grows sub linear on input chars, and it is of note
// that after a BWT, the phrase contains a good MTF estimate and so maybe fine
// to append each of its chars to many dictionary entries. In this way the
// growth of entries becomes "almost" linear. The dictionary memory foot print
// becomes quadratic. Short to medium inputs become even smaller. Long input
// lengths may become slightly larger on not using dictionary entries integrated
// over input length, but will most likely be slightly smaller.

// DO NOT USE bounce (=false) IF NO BWT BEFORE.
// Under these conditions many unused dictionary entries will be wasted on long
// highly redundant inputs. It is a feature for pre BWT packed PONs.
//===============================================================================
function encodeLZW(data: string, bounce: boolean): string {
var dict = {};
data = encodeSUTF(data);
var out = [];
var currChar;
var phrase = data[0];
var codeL = 0;
var code = 256;
for (var i=1; i<data.length; i++) {
currChar=data[i];
if (dict['_' + phrase + currChar] != null) {
phrase += currChar;
}
else {
out.push(codeL = phrase.length > 1 ? dict['_'+phrase] : phrase.charCodeAt(0));
if(code < 65536) {//limit
dict['_' + phrase + currChar] = code;
code++;
if(bounce && codeL != code - 2) {//code -- and one before would be last symbol out
_.each(phrase.split(''), function (chr) {
if(code < 65536) {
while(dict['_' + phrase + chr]) phrase += chr;
dict['_' + phrase + chr] = code;
code++;
}
});
}
}
phrase=currChar;
}
}
out.push(phrase.length > 1 ? dict['_'+phrase] : phrase.charCodeAt(0));
for (var i=0; i<out.length; i++) {
out[i] = String.fromCharCode(out[i]);
}
return out.join();
}

function encodeSUTF(s: string): string {
s = encodeUTF(s);
var out = [];
var msb: number = 0;
var two: boolean = false;
var first: boolean = true;
_.each(s, function(val) {
var k = val.charCodeAt(0);
if(k > 127) {
if (first == true) {
first = false;
two = (k & 32) == 0;
if (k == msb) return;
msb = k;
} else {
if (two == true) two = false;
else first = true;
}
}
out.push(String.fromCharCode(k));
});
return out.join();
}

function encodeBounce(s: string): string {
return encodeLZW(s, true);
}

//=================================================
// Decompress an LZW-encoded string
//=================================================
function decodeLZW(s: string, bounce: boolean): string {
var dict = {};
var dictI = {};
var data = (s + '').split('');
var currChar = data[0];
var oldPhrase = currChar;
var out = [currChar];
var code = 256;
var phrase;
for (var i=1; i<data.length; i++) {
var currCode = data[i].charCodeAt(0);
if (currCode < 256) {
phrase = data[i];
}
else {
phrase = dict['_'+currCode] ? dict['_'+currCode] : (oldPhrase + currChar);
}
out.push(phrase);
currChar = phrase.charAt(0);
if(code < 65536) {
dict['_'+code] = oldPhrase + currChar;
dictI['_' + oldPhrase + currChar] = code;
code++;
if(bounce && !dict['_'+currCode]) {//the special lag
_.each(oldPhrase.split(''), function (chr) {
if(code < 65536) {
while(dictI['_' + oldPhrase + chr]) oldPhrase += chr;
dict['_' + code] = oldPhrase + chr;
dictI['_' + oldPhrase + chr] = code;
code++;
}
});
}
}
oldPhrase = phrase;
}
return decodeSUTF(out.join(''));
}

function decodeSUTF(s: string): string {
var out = [];
var msb: number = 0;
var make: number = 0;
var from: number = 0;
_.each(s, function(val, idx) {
var k = val.charCodeAt(0);
if (k > 127) {
if (idx < from + make) return;
if ((k & 128) != 0) {
msb = k;
make = (k & 64) == 0 ? 2 : 3;
from = idx + 1;
} else {
from = idx;
}
out.push(String.fromCharCode(msb));
for (var i = from; i < from + make; i++) {
out.push(s[i]);
}
return;
} else {
out.push(String.fromCharCode(k));
}
});
return decodeUTF(out.join());
}

function decodeBounce(s: string): string {
return decodeLZW(s, true);
}

//=================================================
// UTF mangling with ArrayBuffer mappings
//=================================================
declare function escape(s: string): string;
declare function unescape(s: string): string;

function encodeUTF(s: string): string {
return unescape(encodeURIComponent(s));
}

function decodeUTF(s: string): string {
return decodeURIComponent(escape(s));
}

function toBuffer(str: string): ArrayBuffer {
var arr = encodeSUTF(str);
var buf = new ArrayBuffer(arr.length);
var bufView = new Uint8Array(buf);
for (var i = 0, arrLen = arr.length; i < arrLen; i++) {
bufView[i] = arr[i].charCodeAt(0);
}
return buf;
}

function fromBuffer(buf: ArrayBuffer): string {
var out: string = '';
var bufView = new Uint8Array(buf);
for (var i = 0, arrLen = bufView.length; i < arrLen; i++) {
out += String.fromCharCode(bufView[i]);
}
return decodeSUTF(out);
}

//===============================================
//A Burrows Wheeler Transform of strings
//===============================================
function encodeBWT(data: string): any {
var size = data.length;
var buff = data + data;
var idx = _.range(size).sort(function(x, y){
for (var i = 0; i < size; i++) {
var r = buff[x + i].charCodeAt(0) - buff[y + i].charCodeAt(0);
if (r !== 0) return r;
}
return 0;
});

var top: number;
var work = _.reduce(_.range(size), function(memo, k: number) {
var p = idx[k];
if (p === 0) top = k;
memo.push(buff[p + size - 1]);
return memo;
}, []).join('');

return { top: top, data: work };
}

function decodeBWT(top: number, data: string): string { //JSON

var size = data.length;
var idx = _.range(size).sort(function(x, y){
var c = data[x].charCodeAt(0) - data[y].charCodeAt(0);
if (c === 0) return x - y;
return c;
});

var p = idx[top];
return _.reduce(_.range(size), function(memo){
memo.push(data[p]);
p = idx[p];
return memo;
}, []).join('');
}

//==================================================
// Two functions to do a dictionary effectiveness
// split of what to compress. This has the effect
// of applying an effective dictionary size bigger
// than would otherwise be.
//==================================================
function tally(data: string): number[] {
return _.reduce(data.split(''), function (memo: number[], charAt: string): number[] {
memo[charAt.charCodeAt(0)]++;//increase
return memo;
}, []);
}

function splice(data: string): string[] {
var acc = 0;
var counts = tally(data);
return _.reduce(counts, function(memo, count: number, key) {
memo.push(key + data.substring(acc, count + acc));
/* adds a seek char:
This assists in DB seek performance as it's the ordering char for the lzw block */
acc += count;
}, []);
}

//=====================================================
// A packer and unpacker with good efficiency
//=====================================================
// These are the ones to call, and the rest sre maybe
// useful, but can be considered as foundations for
// these functions. some block length management is
// built in.
function pack(data: any): any {
//limits
var str = JSON.stringify(data);
var chain = {};
if(str.length > 524288) {
chain = pack(str.substring(524288));
str = str.substring(0, 524288);
}
var bwt = encodeBWT(str);
var mix = splice(bwt.data);

mix = _.map(mix, encodeBounce);
return {
top: bwt.top,
/* tally: encode_tally(tally), */
mix: mix,
chn: chain
};
}

function unpack(got: any): any {
var top: number = got.top || 0;
/* var tally = got.tally; */
var mix: string[] = got.mix || [];

mix = _.map(mix, decodeBounce);
var mixr: string = _.reduce(mix, function(memo: string, lzw: string): string {
/* var key = lzw.charAt(0);//get seek char */
memo += lzw.substring(1, lzw.length);//concat
return memo;
}, '');
var chain = got.chn;
var res = decodeBWT(top, mixr);
if(_.has(chain, 'chn')) {
res += unpack(chain.chn);
}
return JSON.parse(res);
}