Alexander Rose 7 лет назад
Родитель
Commit
c852ad434b

+ 1 - 1
src/mol-data/db/column.ts

@@ -139,7 +139,7 @@ namespace Column {
         return columnIndicesOf(c, test);
     }
 
-    /** Makes the column backned by an array. Useful for columns that accessed often. */
+    /** Makes the column backed by an array. Useful for columns that are accessed often. */
     export function asArrayColumn<T>(c: Column<T>, array?: ArrayCtor<T>): Column<T> {
         if (c['@array']) return c;
         if (!c.isDefined) return Undefined(c.rowCount, c.schema) as any as Column<T>;

+ 80 - 0
src/mol-io/reader/_spec/csv.spec.ts

@@ -0,0 +1,80 @@
+/**
+ * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import Csv from '../csv/parser'
+
+const csvStringBasic = `StrCol,IntCol,FloatCol
+# comment
+string1,-1,-0.34e3
+string2,42,2.44`
+
+const csvStringAdvanced = `StrCol,"Int Col",FloatCol
+ string1  \t , -1,  -0.34e3
+    # comment
+   " stri
+ng2" ,42, 2.44 `
+
+const tabString = `StrCol\tIntCol\tFloatCol
+string1\t-1\t-0.34e3
+string2\t42\t2.44`
+
+describe('csv reader', () => {
+    it('basic', async () => {
+        const parsed = await Csv(csvStringBasic)();
+        if (parsed.isError) return;
+        const csvFile = parsed.result;
+
+        // csvFile.table.columnNames.forEach(name => {
+        //     const col = csvFile.table.getColumn(name)
+        //     if (col) console.log(name, col.toStringArray())
+        // })
+
+        const strCol = csvFile.table.getColumn('StrCol')
+        if (strCol) expect(strCol.toStringArray()).toEqual(['string1', 'string2'])
+
+        const intCol = csvFile.table.getColumn('IntCol')
+        if (intCol) expect(intCol.toIntArray()).toEqual([-1, 42])
+
+        const floatCol = csvFile.table.getColumn('FloatCol')
+        if (floatCol) expect(floatCol.toFloatArray()).toEqual([-340.0, 2.44])
+
+        expect.assertions(3)
+    });
+
+    it('advanced', async () => {
+        const parsed = await Csv(csvStringAdvanced)();
+        if (parsed.isError) return;
+        const csvFile = parsed.result;
+
+        const strCol = csvFile.table.getColumn('StrCol')
+        if (strCol) expect(strCol.toStringArray()).toEqual(['string1', ' stri\nng2'])
+
+        const intCol = csvFile.table.getColumn('Int Col')
+        if (intCol) expect(intCol.toIntArray()).toEqual([-1, 42])
+
+        const floatCol = csvFile.table.getColumn('FloatCol')
+        if (floatCol) expect(floatCol.toFloatArray()).toEqual([-340.0, 2.44])
+
+        expect.assertions(3)
+    });
+
+    it('tabs', async () => {
+        const parsed = await Csv(tabString, { delimiter: '\t' })();
+        if (parsed.isError) return;
+        const csvFile = parsed.result;
+
+        const strCol = csvFile.table.getColumn('StrCol')
+        if (strCol) expect(strCol.toStringArray()).toEqual(['string1', 'string2'])
+
+        const intCol = csvFile.table.getColumn('IntCol')
+        if (intCol) expect(intCol.toIntArray()).toEqual([-1, 42])
+
+        const floatCol = csvFile.table.getColumn('FloatCol')
+        if (floatCol) expect(floatCol.toFloatArray()).toEqual([-340.0, 2.44])
+
+        expect.assertions(3)
+    });
+});

+ 9 - 0
src/mol-io/reader/common/text/tokenizer.ts

@@ -41,6 +41,14 @@ export namespace Tokenizer {
         return state.data.substring(state.tokenStart, state.tokenEnd);
     }
 
+    /** Resets the state */
+    export function reset (state: Tokenizer) {
+        state.position = 0
+        state.lineNumber = 1
+        state.tokenStart = 0
+        state.tokenEnd = 0
+    }
+
     /**
      * Eat everything until a newline occurs.
      */
@@ -227,6 +235,7 @@ export namespace TokenBuilder {
     }
 
     export function create(tokenizer: Tokenizer, size: number): Tokens {
+        size = Math.max(10, size)
         return <Builder>{
             data: tokenizer.data,
             indicesLenMinus2: (size - 2) | 0,

+ 36 - 0
src/mol-io/reader/csv/data-model.ts

@@ -0,0 +1,36 @@
+/**
+ * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { Field as Column } from '../cif/data-model'
+
+export { Column }
+
+export interface File {
+    readonly name?: string,
+    readonly table: Table
+}
+
+export function File(table: Table, name?: string): File {
+    return { name, table };
+}
+
+export interface Table {
+    readonly rowCount: number,
+    readonly columnNames: ReadonlyArray<string>,
+    getColumn(name: string): Column | undefined
+}
+
+export function Table(rowCount: number, columnNames: string[], columns: Columns): Table {
+    return { rowCount, columnNames: [...columnNames], getColumn(name) { return columns[name]; } };
+}
+
+export type Columns = { [name: string]: Column }
+
+// export namespace Table {
+//     export function empty(name: string): Table {
+//         return { rowCount: 0, name, fieldNames: [], getColumn(name: string) { return void 0; } };
+//     };
+// }

+ 9 - 0
src/mol-io/reader/csv/field.ts

@@ -0,0 +1,9 @@
+/**
+ * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import Field from '../cif/text/field'
+
+export default Field

+ 287 - 0
src/mol-io/reader/csv/parser.ts

@@ -0,0 +1,287 @@
+/**
+ * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+// import { Column } from 'mol-data/db'
+import { Tokens, TokenBuilder, Tokenizer } from '../common/text/tokenizer'
+import * as Data from './data-model'
+import Field from './field'
+import Result from '../result'
+import Computation from 'mol-util/computation'
+
+const enum CsvTokenType {
+    Value = 0,
+    Comment = 1,
+    End = 2
+}
+
+interface State {
+    data: string;
+    tokenizer: Tokenizer,
+
+    tokenType: CsvTokenType;
+    chunker: Computation.Chunker,
+    tokens: Tokens[],
+
+    fieldCount: number,
+    recordCount: number,
+
+    columnCount: number,
+    columnNames: string[],
+
+    quoteCharCode: number,
+    commentCharCode: number,
+    delimiterCharCode: number,
+
+    noColumnNamesRecord: boolean
+}
+
+function State(data: string, ctx: Computation.Context, opts: CsvOptions): State {
+
+    const tokenizer = Tokenizer(data)
+    return {
+        data,
+        tokenizer,
+
+        tokenType: CsvTokenType.End,
+        chunker: Computation.chunker(ctx, 100000),
+        tokens: [],
+
+        fieldCount: 0,
+        recordCount: 0,
+
+        columnCount: 0,
+        columnNames: [],
+
+        quoteCharCode: opts.quote.charCodeAt(0),
+        commentCharCode: opts.comment.charCodeAt(0),
+        delimiterCharCode: opts.delimiter.charCodeAt(0),
+        noColumnNamesRecord: opts.noColumnNames
+    };
+}
+
+/**
+ * Eat everything until a delimiter or newline occurs.
+ * Ignores whitespace at the end of the value, i.e. trim right.
+ * Returns true when a newline occurs after the value.
+ */
+function eatValue(state: Tokenizer, delimiterCharCode: number) {
+    while (state.position < state.length) {
+        const c = state.data.charCodeAt(state.position);
+        ++state.position
+        switch (c) {
+            case 10:  // \n
+            case 13:  // \r
+                return true;
+            case delimiterCharCode:
+                return;
+            case 9:  // \t
+            case 32:  // ' '
+                break;
+            default:
+                ++state.tokenEnd;
+                break;
+        }
+    }
+}
+
+/**
+ * Eats a quoted value. Can contain a newline.
+ * Returns true when a newline occurs after the quoted value.
+ *
+ * Embedded quotes are represented by a pair of double quotes:
+ * - ""xx"" => "xx"
+ */
+function eatQuoted(state: Tokenizer, quoteCharCode: number, delimiterCharCode: number) {
+    ++state.position;
+    while (state.position < state.length) {
+        const c = state.data.charCodeAt(state.position);
+        if (c === quoteCharCode) {
+            const next = state.data.charCodeAt(state.position + 1);
+            if (next !== quoteCharCode) {
+                // get rid of the quotes.
+                state.tokenStart++;
+                state.tokenEnd = state.position;
+                ++state.position;
+                return skipEmpty(state, delimiterCharCode)
+            }
+        }
+        ++state.position;
+    }
+    state.tokenEnd = state.position;
+}
+
+/**
+ * Skips empty chars.
+ * Returns true when the current char is a newline.
+ */
+function skipEmpty(state: Tokenizer, delimiterCharCode: number) {
+    while (state.position < state.length) {
+        const c = state.data.charCodeAt(state.position);
+        if (c !== 9 && c !== 32 && c !== delimiterCharCode) {  // \t or ' '
+            return c === 10 || c === 13;  // \n or \r
+        }
+        ++state.position
+    }
+}
+
+function skipWhitespace(state: Tokenizer) {
+    let prev = -1;
+    while (state.position < state.length) {
+        const c = state.data.charCodeAt(state.position);
+        switch (c) {
+            case 9:  // '\t'
+            case 32:  // ' '
+                prev = c;
+                ++state.position;
+                break;
+            case 10:  // \n
+                // handle \r\n
+                if (prev !== 13) {
+                    ++state.lineNumber;
+                }
+                prev = c;
+                ++state.position;
+                break;
+            case 13:  // \r
+                prev = c;
+                ++state.position;
+                ++state.lineNumber;
+                break;
+            default:
+                return;
+        }
+    }
+}
+
+function skipLine(state: Tokenizer) {
+    while (state.position < state.length) {
+        const c = state.data.charCodeAt(state.position);
+        if (c === 10 || c === 13) return  // \n or \r
+        ++state.position
+    }
+}
+
+/**
+ * Move to the next token.
+ * Returns true when the current char is a newline, i.e. indicating a full record.
+ */
+function moveNextInternal(state: State) {
+    const tokenizer = state.tokenizer
+    skipWhitespace(tokenizer);
+
+    if (tokenizer.position >= tokenizer.length) {
+        state.tokenType = CsvTokenType.End;
+        return true;
+    }
+
+    tokenizer.tokenStart = tokenizer.position;
+    tokenizer.tokenEnd = tokenizer.position;
+    const c = state.data.charCodeAt(tokenizer.position);
+    switch (c) {
+        case state.commentCharCode:
+            state.tokenType = CsvTokenType.Comment;
+            skipLine(tokenizer);
+            break;
+        case state.quoteCharCode:
+            state.tokenType = CsvTokenType.Value;
+            return eatQuoted(tokenizer, state.quoteCharCode, state.delimiterCharCode);
+        default:
+            state.tokenType = CsvTokenType.Value;
+            return eatValue(tokenizer, state.delimiterCharCode);
+    }
+}
+
+/**
+ * Moves to the next non-comment token/line.
+ * Returns true when the current char is a newline, i.e. indicating a full record.
+ */
+function moveNext(state: State) {
+    let newRecord = moveNextInternal(state);
+    while (state.tokenType === CsvTokenType.Comment) {
+        newRecord = moveNextInternal(state);
+    }
+    return newRecord
+}
+
+function readRecordsChunk(state: State, chunkSize: number) {
+    if (state.tokenType === CsvTokenType.End) return 0
+
+    let newRecord = moveNext(state);
+    if (newRecord) ++state.recordCount
+
+    const { tokens, tokenizer } = state;
+    let counter = 0;
+    while (state.tokenType === CsvTokenType.Value && counter < chunkSize) {
+        TokenBuilder.add(tokens[state.fieldCount % state.columnCount], tokenizer.tokenStart, tokenizer.tokenEnd);
+        ++state.fieldCount
+        newRecord = moveNext(state);
+        if (newRecord) ++state.recordCount
+        ++counter;
+    }
+    return counter;
+}
+
+function readRecordsChunks(state: State) {
+    return state.chunker.process(
+        chunkSize => readRecordsChunk(state, chunkSize),
+        update => update({ message: 'Parsing...', current: state.tokenizer.position, max: state.data.length }));
+}
+
+function addColumn (state: State) {
+    state.columnNames.push(Tokenizer.getTokenString(state.tokenizer))
+    state.tokens.push(TokenBuilder.create(state.tokenizer, state.data.length / 80))
+}
+
+function init(state: State) {
+    let newRecord = moveNext(state)
+    while (!newRecord) {
+        addColumn(state)
+        newRecord = moveNext(state);
+    }
+    addColumn(state)
+    state.columnCount = state.columnNames.length
+    if (state.noColumnNamesRecord) {
+        state.columnNames.forEach((x, i, arr) => arr[i] = i+'')
+        Tokenizer.reset(state.tokenizer)
+    }
+}
+
+async function handleRecords(state: State): Promise<Data.Table> {
+    init(state)
+    await readRecordsChunks(state)
+
+    const columns: Data.Columns = Object.create(null);
+    for (let i = 0; i < state.columnCount; ++i) {
+        columns[state.columnNames[i]] = Field(state.tokens[i], state.recordCount);
+    }
+
+    return Data.Table(state.recordCount, state.columnNames, columns)
+}
+
+async function parseInternal(data: string, ctx: Computation.Context, opts: CsvOptions): Promise<Result<Data.File>> {
+    const state = State(data, ctx, opts);
+
+    ctx.update({ message: 'Parsing...', current: 0, max: data.length });
+    const table = await handleRecords(state)
+    const result = Data.File(table)
+    return Result.success(result);
+}
+
+interface CsvOptions {
+    quote: string;
+    comment: string;
+    delimiter: string;
+    noColumnNames: boolean;
+}
+
+export function parse(data: string, opts?: Partial<CsvOptions>) {
+    const completeOpts = Object.assign({}, { quote: '"', comment: '#', delimiter: ',', noColumnNames: false }, opts)
+    return Computation.create<Result<Data.File>>(async ctx => {
+        return await parseInternal(data, ctx, completeOpts);
+    });
+}
+
+export default parse;

+ 7 - 0
src/mol-io/reader/csv/schema.ts

@@ -0,0 +1,7 @@
+/**
+ * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+export { toTable } from '../cif/schema'

+ 34 - 39
src/script.ts

@@ -10,18 +10,15 @@ import * as fs from 'fs'
 
 require('util.promisify').shim();
 const readFileAsync = util.promisify(fs.readFile);
-const writeFileAsync = util.promisify(fs.writeFile);
 
 import Gro from 'mol-io/reader/gro/parser'
+import Csv from 'mol-io/reader/csv/parser'
 import CIF from 'mol-io/reader/cif'
 
 import Computation from 'mol-util/computation'
 
 import { Model } from 'mol-model/structure'
 
-// import { toTypedFrame as applySchema } from './reader/cif/schema'
-import { generateSchema } from 'mol-io/reader/cif/schema/utils'
-
 const file = '1crn.gro'
 // const file = 'water.gro'
 // const file = 'test.gro'
@@ -155,7 +152,7 @@ export async function _cif() {
     runCIF(input);
 
     path = `./examples/1cbs_full.bcif`;
-    
+
     const input2 = await readFileAsync(path)
     console.log('------------------');
     console.log('BinaryCIF:');
@@ -164,38 +161,7 @@ export async function _cif() {
     runCIF(input2);
 }
 
-_cif();
-
-async function runDic(input: string | Uint8Array) {
-    console.time('parseDic');
-    const comp = typeof input === 'string' ? CIF.parseText(input) : CIF.parseBinary(input);
-
-    const ctx = Computation.observable({ updateRateMs: 250, observer: p => showProgress('DIC', p) });
-    const parsed = await comp(ctx);
-    console.timeEnd('parseDic');
-    if (parsed.isError) {
-        console.log(parsed);
-        return;
-    }
-
-    const schema = generateSchema(parsed.result.blocks[0])
-    // console.log(schema)
-    // console.log(util.inspect(Object.keys(schema).length, {showHidden: false, depth: 1}))
-
-    await writeFileAsync('./src/reader/cif/schema/mmcif-gen.ts', schema, 'utf8')
-
-    return schema
-}
-
-export async function _dic() {
-    let path = './build/dics/mmcif_pdbx_v50.dic'
-    const input = await readFileAsync(path, 'utf8')
-    console.log('------------------');
-    console.log('Text DIC:');
-    return runDic(input);
-}
-
-_dic();
+// _cif();
 
 const comp = Computation.create(async ctx => {
     for (let i = 0; i < 0; i++) {
@@ -204,9 +170,38 @@ const comp = Computation.create(async ctx => {
     }
     return 42;
 });
-async function testComp() {
+export async function testComp() {
     const ctx = Computation.observable({ observer: p => showProgress('test', p) });
     const ret = await comp(ctx);
     console.log('computation returned', ret);
 }
-testComp();
+// testComp();
+
+
+const csvString = ` Year,Make,Model,Length
+1997,Ford,"E350
+
+MOIN",2.34
+2000,Mercury, Cougar,2.38`
+
+export async function testCsv () {
+    const parsed = await Csv(csvString)();
+
+    if (parsed.isError) {
+        console.log(parsed)
+        return;
+    }
+
+    const csvFile = parsed.result;
+    csvFile.table.columnNames.forEach(name => {
+        const col = csvFile.table.getColumn(name)
+        if (col) console.log(name, col.toStringArray())
+    })
+
+    const year = csvFile.table.getColumn('Year')
+    if (year) console.log('(int)Year', year.toIntArray())
+
+    const length = csvFile.table.getColumn('Length')
+    if (length) console.log('(float)Length', length.toFloatArray())
+}
+testCsv()