parser.ts

/**
 * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */

// import { Column } from 'mol-data/db'
import { Tokens, TokenBuilder, Tokenizer } from '../common/text/tokenizer';
import * as Data from './data-model';
import { Field } from './field';
import { ReaderResult as Result } from '../result';
import { Task, RuntimeContext, chunkedSubtask } from '../../../mol-task';

const enum CsvTokenType {
    Value = 0,
    Comment = 1,
    End = 2
}

interface State {
    data: string,
    tokenizer: Tokenizer,

    tokenType: CsvTokenType,
    runtimeCtx: RuntimeContext,
    tokens: Tokens[],

    fieldCount: number,
    recordCount: number,
    columnCount: number,
    columnNames: string[],

    quoteCharCode: number,
    commentCharCode: number,
    delimiterCharCode: number,
    noColumnNamesRecord: boolean
}

function State(data: string, runtimeCtx: RuntimeContext, opts: CsvOptions): State {
    const tokenizer = Tokenizer(data);
    return {
        data,
        tokenizer,

        tokenType: CsvTokenType.End,
        runtimeCtx,
        tokens: [],

        fieldCount: 0,
        recordCount: 0,
        columnCount: 0,
        columnNames: [],

        quoteCharCode: opts.quote.charCodeAt(0),
        commentCharCode: opts.comment.charCodeAt(0),
        delimiterCharCode: opts.delimiter.charCodeAt(0),
        noColumnNamesRecord: opts.noColumnNames
    };
}

/**
 * Eat everything until a delimiter or newline occurs.
 * Ignores whitespace at the end of the value, i.e. trims right.
 * Returns true when a newline occurs after the value, undefined otherwise.
 */
function eatValue(state: Tokenizer, delimiterCharCode: number) {
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        ++state.position;
        switch (c) {
            case 10: // \n
            case 13: // \r
                return true;
            case delimiterCharCode:
                return;
            case 9: // \t
            case 32: // ' '
                // do not extend the token yet, so trailing whitespace is trimmed
                break;
            default:
                // extend the token past this character; assigning the current
                // position (rather than incrementing tokenEnd) keeps the bounds
                // correct when the value contains inner whitespace
                state.tokenEnd = state.position;
                break;
        }
    }
}

/**
 * Eats a quoted value. Can contain a newline.
 * Returns true when a newline occurs after the quoted value.
 *
 * Embedded quotes are represented by a pair of double quotes:
 * - ""xx"" => "xx"
 */
function eatQuoted(state: Tokenizer, quoteCharCode: number, delimiterCharCode: number) {
    ++state.position;
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        if (c === quoteCharCode) {
            const next = state.data.charCodeAt(state.position + 1);
            if (next !== quoteCharCode) {
                // get rid of the quotes.
                state.tokenStart++;
                state.tokenEnd = state.position;
                ++state.position;
                return skipEmpty(state, delimiterCharCode);
            }
            // escaped quote pair: skip the second quote so it is not
            // mistaken for the closing quote
            ++state.position;
        }
        ++state.position;
    }
    state.tokenEnd = state.position;
}

/**
 * Skips tabs, spaces and the delimiter.
 * Returns true when the char that stopped the skipping is a newline.
 */
function skipEmpty(state: Tokenizer, delimiterCharCode: number) {
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        if (c !== 9 && c !== 32 && c !== delimiterCharCode) { // neither \t nor ' ' nor the delimiter
            return c === 10 || c === 13; // \n or \r
        }
        ++state.position;
    }
}

function skipWhitespace(state: Tokenizer) {
    let prev = -1;
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        switch (c) {
            case 9: // \t
            case 32: // ' '
                prev = c;
                ++state.position;
                break;
            case 10: // \n
                // handle \r\n
                if (prev !== 13) {
                    ++state.lineNumber;
                }
                prev = c;
                ++state.position;
                break;
            case 13: // \r
                prev = c;
                ++state.position;
                ++state.lineNumber;
                break;
            default:
                return;
        }
    }
}

function skipLine(state: Tokenizer) {
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        if (c === 10 || c === 13) return; // \n or \r
        ++state.position;
    }
}

/**
 * Move to the next token.
 * Returns true when the current char is a newline, i.e. indicating a full record.
 */
function moveNextInternal(state: State) {
    const tokenizer = state.tokenizer;
    skipWhitespace(tokenizer);

    if (tokenizer.position >= tokenizer.length) {
        state.tokenType = CsvTokenType.End;
        return false;
    }

    tokenizer.tokenStart = tokenizer.position;
    tokenizer.tokenEnd = tokenizer.position;

    const c = state.data.charCodeAt(tokenizer.position);
    switch (c) {
        case state.commentCharCode:
            state.tokenType = CsvTokenType.Comment;
            skipLine(tokenizer);
            break;
        case state.quoteCharCode:
            state.tokenType = CsvTokenType.Value;
            return eatQuoted(tokenizer, state.quoteCharCode, state.delimiterCharCode);
        default:
            state.tokenType = CsvTokenType.Value;
            return eatValue(tokenizer, state.delimiterCharCode);
    }
}

/**
 * Moves to the next non-comment token/line.
 * Returns true when the current char is a newline, i.e. indicating a full record.
 */
function moveNext(state: State) {
    let newRecord = moveNextInternal(state);
    while (state.tokenType === CsvTokenType.Comment) {
        newRecord = moveNextInternal(state);
    }
    return newRecord;
}

function readRecordsChunk(chunkSize: number, state: State) {
    if (state.tokenType === CsvTokenType.End) return 0;

    let counter = 0;
    let newRecord: boolean | undefined;

    const { tokens, tokenizer } = state;
    while (state.tokenType === CsvTokenType.Value && counter < chunkSize) {
        TokenBuilder.add(tokens[state.fieldCount % state.columnCount], tokenizer.tokenStart, tokenizer.tokenEnd);
        ++state.fieldCount;
        newRecord = moveNext(state);
        if (newRecord) {
            ++state.recordCount;
            ++counter;
        }
    }
    return counter;
}

function readRecordsChunks(state: State) {
    const newRecord = moveNext(state);
    if (newRecord) ++state.recordCount;
    return chunkedSubtask(state.runtimeCtx, 100000, state, readRecordsChunk,
        (ctx, state) => ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.data.length }));
}

function addColumn(state: State) {
    state.columnNames.push(Tokenizer.getTokenString(state.tokenizer));
    // pre-size the token builder assuming roughly 80 characters per record
    state.tokens.push(TokenBuilder.create(state.tokenizer.data, state.data.length / 80));
}

function init(state: State) {
    let newRecord = moveNext(state);
    while (!newRecord) {
        addColumn(state);
        newRecord = moveNext(state);
    }
    addColumn(state);
    state.columnCount = state.columnNames.length;
    if (state.noColumnNamesRecord) {
        state.columnNames.forEach((x, i, arr) => arr[i] = i + '');
        Tokenizer.reset(state.tokenizer);
    }
}

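// Illustrative example (not from the original source): given `id,name\n1,foo\n`,
// init() tokenizes the first record as the header, yielding
// columnNames = ['id', 'name'] and one token builder per column. With
// noColumnNames set, the names are replaced by indices ('0', '1') and the
// tokenizer is reset so that the first record is parsed as data.
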
async function handleRecords(state: State): Promise<Data.CsvTable> {
    init(state);
    await readRecordsChunks(state);

    const columns: Data.CsvColumns = Object.create(null);
    for (let i = 0; i < state.columnCount; ++i) {
        columns[state.columnNames[i]] = Field(state.tokens[i]);
    }

    return Data.CsvTable(state.recordCount, state.columnNames, columns);
}

async function parseInternal(data: string, ctx: RuntimeContext, opts: CsvOptions): Promise<Result<Data.CsvFile>> {
    const state = State(data, ctx, opts);
    ctx.update({ message: 'Parsing...', current: 0, max: data.length });
    const table = await handleRecords(state);
    const result = Data.CsvFile(table);
    return Result.success(result);
}

interface CsvOptions {
    quote: string;
    comment: string;
    delimiter: string;
    noColumnNames: boolean;
}

export function parseCsv(data: string, opts?: Partial<CsvOptions>) {
    const completeOpts = Object.assign({}, { quote: '"', comment: '#', delimiter: ',', noColumnNames: false }, opts);
    return Task.create<Result<Data.CsvFile>>('Parse CSV', async ctx => {
        return await parseInternal(data, ctx, completeOpts);
    });
}
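
// Example usage (an illustrative sketch, not part of the original module). It
// assumes the Task returned by parseCsv is executed via the mol-task `run()`
// method, that an error `ReaderResult` carries a `message`, and that the
// property names on the parsed data (`table`, `columnNames`) match the
// `Data.CsvFile`/`Data.CsvTable` constructors used in `handleRecords` above.
//
// async function example() {
//     const parsed = await parseCsv('a\tb\n1\t2\n', { delimiter: '\t' }).run();
//     if (parsed.isError) {
//         console.error(parsed.message);
//         return;
//     }
//     const { table } = parsed.result;
//     console.log(table.columnNames); // ['a', 'b']
// }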