Browse Source

cif2bcif column classifier

David Sehnal 6 years ago
parent
commit
0e86ac5491

+ 43 - 14
src/apps/cif2bcif/converter.ts

@@ -8,36 +8,65 @@ import CIF, { CifCategory } from 'mol-io/reader/cif'
 import { CifWriter } from 'mol-io/writer/cif'
 import * as fs from 'fs'
 import classify from './field-classifier'
+import { Progress, Task, RuntimeContext } from 'mol-task';
 
-async function getCIF(path: string) {
+/**
+ * Console progress observer: overwrites the current terminal line with
+ * blanks, then prints the formatted progress from the start of that line.
+ */
+function showProgress(p: Progress) {
+    // 79 spaces, the same width as `new Array(80).join(' ')`.
+    const blankLine = ' '.repeat(79);
+    const out = process.stdout;
+    out.write(`\r${blankLine}`);
+    out.write(`\r${Progress.format(p)}`);
+}
+
+/**
+ * Reads the file at `path` synchronously as UTF-8 text and parses it as CIF
+ * inside the supplied runtime context.
+ *
+ * @returns the `result` of the parse.
+ * @throws Error carrying `parsed.toString()` when the parser reports an error.
+ */
+async function getCIF(ctx: RuntimeContext, path: string) {
     const str = fs.readFileSync(path, 'utf8');
-    const parsed = await CIF.parseText(str).run();
+    const parsed = await CIF.parseText(str).runInContext(ctx);
     if (parsed.isError) {
         throw new Error(parsed.toString());
     }
     return parsed.result;
 }
 
-function getCategoryInstanceProvider(cat: CifCategory): CifWriter.Category.Provider {
+/**
+ * Wraps a parsed category and its already classified fields in a provider
+ * function. The provider ignores its `ctx` argument and returns the category
+ * itself as `data`, together with its name, the given `fields` and its row count.
+ */
+function getCategoryInstanceProvider(cat: CifCategory, fields: CifWriter.Field[]): CifWriter.Category.Provider {
     return function (ctx: any) {
         return {
             data: cat,
             name: cat.name,
-            fields: cat.fieldNames.map(n => classify(n, cat.getField(n)!)),
+            fields,
             rowCount: cat.rowCount
         };
     }
 }
 
-export default async function convert(path: string, asText = false) {
-    const cif = await getCIF(path);
+/**
+ * Converts the CIF file at `path` by running a 'BinaryCIF' task: the file is
+ * parsed, every field of every category is classified, each category is
+ * written to the encoder, and the encoder's data is returned.
+ *
+ * @param path   file to read.
+ * @param asText when true the encoder is created with `binary: false`.
+ * @returns whatever `Task.run` returns for the created task; progress is
+ *          reported through `showProgress`.
+ *
+ * NOTE(review): the encoder output is cast to `Uint8Array` even when `asText`
+ * is true - confirm what the text encoder's `getData()` actually returns.
+ * NOTE(review): the `250` passed to `run` is presumably the progress update
+ * interval in milliseconds - confirm against `Task.run`.
+ */
+export default function convert(path: string, asText = false) {
+    return Task.create<Uint8Array>('BinaryCIF', async ctx => {
+        const cif = await getCIF(ctx, path);
 
-    const encoder = CifWriter.createEncoder({ binary: !asText, encoderName: 'mol* cif2bcif' });
-    for (const b of cif.blocks) {
-        encoder.startDataBlock(b.header);
-        for (const c of b.categoryNames) {
-            encoder.writeCategory(getCategoryInstanceProvider(b.categories[c]));
+        const encoder = CifWriter.createEncoder({ binary: !asText, encoderName: 'mol* cif2bcif' });
+
+        // Total number of progress steps: one per category plus one per field.
+        let maxProgress = 0;
+        for (const b of cif.blocks) {
+            maxProgress += b.categoryNames.length;
+            for (const c of b.categoryNames) maxProgress += b.categories[c].fieldNames.length;
         }
-    }
-    return encoder.getData();
-}
+
+        let current = 0;
+        for (const b of cif.blocks) {
+            encoder.startDataBlock(b.header);
+            for (const c of b.categoryNames) {
+                const cat = b.categories[c];
+                // Classify each field up front so progress can advance per field.
+                const fields: CifWriter.Field[] = [];
+                for (const f of cat.fieldNames) {
+                    fields.push(classify(f, cat.getField(f)!))
+                    current++;
+                    if (ctx.shouldUpdate) await ctx.update({ message: 'Encoding...', current, max: maxProgress });
+                }
+
+                encoder.writeCategory(getCategoryInstanceProvider(b.categories[c], fields));
+                current++;
+                if (ctx.shouldUpdate) await ctx.update({ message: 'Encoding...', current, max: maxProgress });
+            }
+        }
+        await ctx.update('Exporting...');
+        const ret = encoder.getData() as Uint8Array;
+        await ctx.update('Done.');
+        return ret;
+    }).run(showProgress, 250);
+}

+ 171 - 3
src/apps/cif2bcif/field-classifier.ts

@@ -7,9 +7,172 @@
 import { Column } from 'mol-data/db'
 import { CifField } from 'mol-io/reader/cif/data-model'
 import { CifWriter } from 'mol-io/writer/cif'
+import { ArrayEncoder, ArrayEncoding as E } from 'mol-io/common/binary-cif';
+
+/**
+ * Estimates how many bytes an integer column would occupy under several
+ * encoding chains (packing, run-length, delta, delta + run-length) and
+ * selects the chain with the smallest estimate.
+ */
+namespace IntClassifier {
+    /**
+     * Number of packed elements needed to store `value` with integer packing
+     * (as described in the BinaryCIF specification): an element equal to
+     * `upperLimit` or `-upperLimit - 1` means "continued in the next element".
+     */
+    function packSize(value: number, upperLimit: number) {
+        // One element per full limit consumed plus one terminating element.
+        // The negative branch uses `value - 1`; with `value + 1` a value of -1
+        // was counted as zero elements and the lower limit as a single one.
+        return value >= 0
+            ? Math.ceil((value + 1) / upperLimit)
+            : Math.ceil((value - 1) / (-upperLimit - 1));
+    }
+
+    /** Signedness of a column and the matching 8/16-bit packing limits. */
+    type IntColumnInfo = { signed: boolean, limit8: number, limit16: number };
+
+    /** A column is treated as signed as soon as it contains one negative value. */
+    function getInfo(data: number[]): IntColumnInfo {
+        let signed = false;
+        for (let i = 0, n = data.length; i < n; i++) {
+            if (data[i] < 0) {
+                signed = true;
+                break;
+            }
+        }
+        return signed ? { signed, limit8: 0x7F, limit16: 0x7FFF } : { signed, limit8: 0xFF, limit16: 0xFFFF };
+    }
+
+    /** Running totals: packed element counts for 8/16-bit packing and the value count. */
+    type SizeInfo = { pack8: number, pack16: number, count: number }
+    function SizeInfo(): SizeInfo { return { pack8: 0, pack16: 0, count: 0 } };
+
+    /** Accounts for one value using the column's own (signed or unsigned) limits. */
+    function incSize({ limit8, limit16 }: IntColumnInfo, info: SizeInfo, value: number) {
+        info.pack8 += packSize(value, limit8);
+        info.pack16 += packSize(value, limit16);
+        info.count += 1;
+    }
+
+    /** Accounts for one value using the signed limits (used for deltas, which may be negative). */
+    function incSizeSigned(info: SizeInfo, value: number) {
+        info.pack8 += packSize(value, 0x7F);
+        info.pack16 += packSize(value, 0x7FFF);
+        info.count += 1;
+    }
+
+    /**
+     * Chooses between plain 4-byte elements, 16-bit packing and 8-bit packing
+     * and returns the resulting byte length together with the element size.
+     */
+    function byteSize(info: SizeInfo) {
+        if (info.count * 4 < info.pack16 * 2) return { length: info.count * 4, elem: 4 };
+        if (info.pack16 * 2 < info.pack8) return { length: info.pack16 * 2, elem: 2 };
+        return { length: info.pack8, elem: 1 };
+    }
+
+    /** Size estimate when the values are packed directly. */
+    function packingSize(data: number[], info: IntColumnInfo) {
+        const size = SizeInfo();
+        for (let i = 0, n = data.length; i < n; i++) {
+            incSize(info, size, data[i]);
+        }
+        return { ...byteSize(size), kind: 'pack' };
+    }
+
+    /** Size estimate when consecutive differences are packed; the first element is not counted. */
+    function deltaSize(data: number[], info: IntColumnInfo) {
+        const size = SizeInfo();
+        let prev = data[0];
+        for (let i = 1, n = data.length; i < n; i++) {
+            incSizeSigned(size, data[i] - prev);
+            prev = data[i];
+        }
+        return { ...byteSize(size), kind: 'delta' };
+    }
+
+    /** Size estimate when (value, run length) pairs are packed. */
+    function rleSize(data: number[], info: IntColumnInfo) {
+        const size = SizeInfo();
+        // Without this guard an empty column would account for `undefined`
+        // and yield a NaN size that breaks the comparison in getSize.
+        if (data.length === 0) return { ...byteSize(size), kind: 'rle' };
+
+        let run = 1;
+        for (let i = 1, n = data.length; i < n; i++) {
+            if (data[i - 1] !== data[i]) {
+                incSize(info, size, data[i - 1]);
+                incSize(info, size, run);
+                run = 1;
+            } else {
+                run++;
+            }
+        }
+        incSize(info, size, data[data.length - 1]);
+        incSize(info, size, run);
+
+        return { ...byteSize(size), kind: 'rle' };
+    }
+
+    /** Size estimate when (delta, run length) pairs of consecutive differences are packed. */
+    function deltaRleSize(data: number[], info: IntColumnInfo) {
+        const size = SizeInfo();
+        if (data.length === 0) return { ...byteSize(size), kind: 'delta-rle' };
+
+        // The first element is counted as a zero delta (prevValue = 0, run = 1),
+        // so the following deltas have to be measured from data[0], not from 0.
+        let run = 1, prev = data[0], prevValue = 0;
+        for (let i = 1, n = data.length; i < n; i++) {
+            const v = data[i] - prev;
+            if (prevValue !== v) {
+                incSizeSigned(size, prevValue);
+                incSizeSigned(size, run);
+                run = 1;
+            } else {
+                run++;
+            }
+            prevValue = v;
+            prev = data[i];
+        }
+        incSizeSigned(size, prevValue);
+        incSizeSigned(size, run);
+
+        return { ...byteSize(size), kind: 'delta-rle' };
+    }
+
+    /**
+     * Returns the size estimates of all four encoding chains, sorted by
+     * ascending byte length (the best candidate comes first).
+     */
+    export function getSize(data: number[]) {
+        const info = getInfo(data);
+        const sizes = [packingSize(data, info), rleSize(data, info), deltaSize(data, info), deltaRleSize(data, info)];
+        sizes.sort((a, b) => a.length - b.length);
+        return sizes;
+    }
+
+    /**
+     * Picks the encoder chain with the smallest estimated size for `data`.
+     * Columns with fewer than two values are stored as a plain byte array.
+     * `name` is only used by the commented-out debug output.
+     */
+    export function classify(data: number[], name: string): ArrayEncoder {
+        if (data.length < 2) return E.by(E.byteArray);
+
+        const sizes = getSize(data);
+        const size = sizes[0];
+        // console.log(`${name}: ${size.kind} ${size.length}b ${data.length}`);
+        // console.log(`${name}: ${sizes.map(s => `${s.kind}: ${s.length}b`).join(' | ')}`);
+
+        switch (size.kind) {
+            case 'pack': return E.by(E.integerPacking);
+            case 'rle': return E.by(E.runLength).and(E.integerPacking);
+            case 'delta': return E.by(E.delta).and(E.integerPacking);
+            case 'delta-rle': return E.by(E.delta).and(E.runLength).and(E.integerPacking);
+        }
+
+        // Unreachable unless a new kind is added above without a matching case.
+        throw new Error(`IntClassifier: unexpected encoding kind '${size.kind}'`);
+    }
+}
+
+/**
+ * Decides whether a float column can be stored as fixed-point integers and,
+ * if so, which integer encoding chain gives the smallest estimated size.
+ */
+namespace FloatClassifier {
+    const delta = 1e-6;
+
+    /**
+     * Returns the smallest power of ten (1 to 10000) that turns `v` into an
+     * integer within a tolerance of `delta`. 10000 is also returned when no
+     * tested multiplier is sufficient.
+     */
+    function digitCount(v: number) {
+        let m = 1;
+        for (let i = 0; i < 5; i++) {
+            const r = Math.round(m * v) / m;
+            if (Math.abs(v - r) < delta) return m;
+            m *= 10;
+        }
+        return 10000;
+    }
+
+    /**
+     * Chooses an encoder and output array type for a float column.
+     * Columns that need a multiplier of 10000 or more, or whose scaled values
+     * do not fit into 32 bits, are stored as raw Float64 bytes; all others use
+     * fixed-point encoding followed by the cheapest integer encoding chain.
+     * `name` is only used by the commented-out debug output.
+     */
+    export function classify(data: number[], name: string) {
+        // The multiplier never drops below 10, i.e. at least one decimal digit is kept.
+        let dc = 10;
+        for (let i = 0, n = data.length; i < n; i++) dc = Math.max(dc, digitCount(data[i]));
+
+        if (dc >= 10000) return { encoder: E.by(E.byteArray), typedArray: Float64Array };
+
+        const intArray = new Int32Array(data.length);
+        for (let i = 0, n = data.length; i < n; i++) {
+            // Round instead of relying on the Int32Array store, which truncates
+            // (0.29 * 100 === 28.999999999999996 would otherwise become 28).
+            const scaled = Math.round(data[i] * dc);
+            // Fixed-point encoding produces 32-bit integers (BinaryCIF
+            // specification); a value outside that range would wrap around.
+            if (scaled > 0x7FFFFFFF || scaled < -0x80000000) {
+                return { encoder: E.by(E.byteArray), typedArray: Float64Array };
+            }
+            intArray[i] = scaled;
+        }
+
+        const sizes = IntClassifier.getSize(intArray as any);
+        const size = sizes[0];
+
+        // console.log(`>> ${name}: ${size.kind} ${size.length}b ${data.length} x${dc}`);
+        // console.log(`   ${name}: ${sizes.map(s => `${s.kind}: ${s.length}b`).join(' | ')}`);
+
+        switch (size.kind) {
+            case 'pack': return { encoder: E.by(E.fixedPoint(dc)).and(E.integerPacking), typedArray: Float32Array };
+            case 'rle': return { encoder: E.by(E.fixedPoint(dc)).and(E.runLength).and(E.integerPacking), typedArray: Float32Array };
+            case 'delta': return { encoder: E.by(E.fixedPoint(dc)).and(E.delta).and(E.integerPacking), typedArray: Float32Array };
+            case 'delta-rle': return { encoder: E.by(E.fixedPoint(dc)).and(E.delta).and(E.runLength).and(E.integerPacking), typedArray: Float32Array };
+        }
+
+        // Unreachable unless IntClassifier.getSize reports a kind not handled above.
+        throw new Error(`FloatClassifier: unexpected encoding kind '${size.kind}'`);
+    }
+}
 
 const intRegex = /^-?\d+$/
-const floatRegex = /^-?(([0-9]+)[.]?|([0-9]*[.][0-9]+))([(][0-9]+[)])?([eE][+-]?[0-9]+)?/
+// Optional '-', then either digits with an optional trailing '.' or optional
+// digits followed by '.' and digits; optionally a parenthesised digit run such
+// as "(12)" (presumably a standard-uncertainty suffix - confirm) and an
+// exponent. The trailing '$' makes the pattern cover the whole string, so
+// values with extra trailing characters no longer match.
+const floatRegex = /^-?(([0-9]+)[.]?|([0-9]*[.][0-9]+))([(][0-9]+[)])?([eE][+-]?[0-9]+)?$/
 
 // Classify a cif field as str, int or float based on the data it contains.
 // To classify a field as int or float all items are checked.
@@ -25,8 +188,13 @@ function classify(name: string, field: CifField): CifWriter.Field {
     }
 
     if (hasString) return { name, type: CifWriter.Field.Type.Str, value: field.str, valueKind: field.valueKind };
-    if (floatCount > 0) return { name, type: CifWriter.Field.Type.Float, value: field.float, valueKind: field.valueKind };
-    return { name, type: CifWriter.Field.Type.Int, value: field.int, valueKind: field.valueKind };
+    if (floatCount > 0) {
+        const { encoder, typedArray } = FloatClassifier.classify(field.toFloatArray({ array: Float64Array }) as number[], name)
+        return CifWriter.Field.float(name, field.float, { valueKind: field.valueKind, encoder, typedArray });
+    } else {
+        const encoder = IntClassifier.classify(field.toIntArray({ array: Int32Array }) as number[], name);
+        return CifWriter.Field.int(name, field.int, { valueKind: field.valueKind, encoder, typedArray: Int32Array });
+    }
 }
 
 export default classify;

+ 2 - 1
src/mol-task/execution/observable.ts

@@ -174,6 +174,7 @@ class ObservableRuntimeContext implements RuntimeContext {
         const progress = this.node.progress;
         if (typeof update === 'string') {
             progress.message = update;
+            progress.isIndeterminate = true;
         } else {
             if (typeof update.canAbort !== 'undefined') progress.canAbort = update.canAbort;
             if (typeof update.message !== 'undefined') progress.message = update.message;
@@ -193,7 +194,7 @@ class ObservableRuntimeContext implements RuntimeContext {
         this.lastUpdatedTime = now();
         this.updateProgress(progress);
 
-        if (!!dontNotify || !shouldNotify(this.info, this.lastUpdatedTime)) return;
+        if (!!dontNotify /*|| !shouldNotify(this.info, this.lastUpdatedTime)*/) return;
 
         notifyObserver(this.info, this.lastUpdatedTime);