123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288 |
- /**
- * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
- *
- * @author Alexander Rose <alexander.rose@weirdbyte.de>
- */
- // import { Column } from 'mol-data/db'
- import { Tokens, TokenBuilder, Tokenizer } from '../common/text/tokenizer';
- import * as Data from './data-model';
- import { Field } from './field';
- import { ReaderResult as Result } from '../result';
- import { Task, RuntimeContext, chunkedSubtask, } from '../../../mol-task';
/** Kind of token the parser state is currently positioned on. */
const enum CsvTokenType {
    /** A field value, quoted or unquoted. */
    Value,
    /** A comment; the rest of its line is skipped. */
    Comment,
    /** The end of the input has been reached. */
    End
}
/** Mutable state shared by all parsing steps of one CSV parse run. */
interface State {
    /** The complete input text. */
    data: string;
    /** Cursor over `data`; holds the current position and token bounds. */
    tokenizer: Tokenizer;
    /** Type of the token the tokenizer was last moved to. */
    tokenType: CsvTokenType;
    /** Task context used for chunking and progress updates. */
    runtimeCtx: RuntimeContext;

    /** One token list per column, in column order. */
    tokens: Tokens[];
    /** Number of field tokens added to `tokens` so far. */
    fieldCount: number;
    /** Number of records counted so far. */
    recordCount: number;
    /** Number of columns, set from `columnNames.length` during initialization. */
    columnCount: number;
    /** Column names, in column order. */
    columnNames: string[];

    /** Char code that opens and closes a quoted value. */
    quoteCharCode: number;
    /** Char code that starts a comment. */
    commentCharCode: number;
    /** Char code that separates fields. */
    delimiterCharCode: number;
    /** When true the first record is data and columns are named by index. */
    noColumnNamesRecord: boolean;
}
/**
 * Creates the initial parser state for `data`.
 *
 * Only the first character of `opts.quote`, `opts.comment` and `opts.delimiter`
 * is used; each is stored as a char code.
 */
function State(data: string, runtimeCtx: RuntimeContext, opts: CsvOptions): State {
    const initial: State = {
        data,
        tokenizer: Tokenizer(data),
        // Nothing has been tokenized yet.
        tokenType: CsvTokenType.End,
        runtimeCtx,

        tokens: [],
        fieldCount: 0,
        recordCount: 0,
        columnCount: 0,
        columnNames: [],

        quoteCharCode: opts.quote.charCodeAt(0),
        commentCharCode: opts.comment.charCodeAt(0),
        delimiterCharCode: opts.delimiter.charCodeAt(0),
        noColumnNamesRecord: opts.noColumnNames
    };
    return initial;
}
/**
 * Eats an unquoted value: everything up to the next delimiter, line break or
 * end of input. The terminating character is consumed.
 *
 * On return `state.tokenEnd` points just past the last non-whitespace character
 * of the value, so trailing whitespace is trimmed while whitespace inside the
 * value (e.g. `a b`) is kept. The caller is expected to have set `tokenStart`
 * and `tokenEnd` to the current position beforehand.
 *
 * @param state tokenizer positioned on the first character of the value
 * @param delimiterCharCode char code of the field delimiter
 * @returns true when the value was terminated by a line break, false when it
 * was terminated by a delimiter or by the end of the input
 */
function eatValue(state: Tokenizer, delimiterCharCode: number): boolean {
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        ++state.position;
        switch (c) {
            case 10: // \n
            case 13: // \r
                return true;
            // Checked before tab/space so that a whitespace delimiter still ends the value.
            case delimiterCharCode:
                return false;
            case 9: // \t
            case 32: // ' '
                // Possibly trailing whitespace: do not extend the token yet.
                break;
            default:
                // Extend the token up to and including this character. Any
                // whitespace passed since the previous non-whitespace character
                // was interior whitespace and becomes part of the value.
                state.tokenEnd = state.position;
                break;
        }
    }
    return false;
}
/**
 * Eats a quoted value. The value can contain delimiters and line breaks.
 *
 * Embedded quotes are represented by a pair of quote characters, e.g.
 * `"a ""quoted"" word"`. Such a pair is stepped over as a unit and does not
 * end the value. The token bounds are set to the raw text between the opening
 * and closing quote; pairs are not collapsed here.
 *
 * If the closing quote is missing, the token runs to the end of the input and
 * still includes the opening quote.
 *
 * @param state tokenizer positioned on the opening quote
 * @param quoteCharCode char code of the quote character
 * @param delimiterCharCode char code of the field delimiter
 * @returns true when a line break follows the quoted value
 */
function eatQuoted(state: Tokenizer, quoteCharCode: number, delimiterCharCode: number) {
    // Step over the opening quote.
    ++state.position;
    while (state.position < state.length) {
        if (state.data.charCodeAt(state.position) === quoteCharCode) {
            if (state.data.charCodeAt(state.position + 1) === quoteCharCode) {
                // Escaped quote: skip both characters so that the second one
                // is not mistaken for the closing quote.
                state.position += 2;
                continue;
            }
            // Closing quote: exclude both surrounding quotes from the token.
            state.tokenStart++;
            state.tokenEnd = state.position;
            ++state.position;
            return skipEmpty(state, delimiterCharCode);
        }
        ++state.position;
    }
    // Unterminated quoted value.
    state.tokenEnd = state.position;
}
/**
 * Advances past tabs, spaces and delimiter characters.
 *
 * Stops on the first other character without consuming it.
 * Note that a run of several delimiters is skipped as a whole.
 *
 * @returns true when the character stopped on is a line break, false when it
 * is anything else, and undefined when the end of the input was reached
 */
function skipEmpty(state: Tokenizer, delimiterCharCode: number) {
    for (; state.position < state.length; ++state.position) {
        const code = state.data.charCodeAt(state.position);
        const skippable = code === 9 /* \t */ || code === 32 /* ' ' */ || code === delimiterCharCode;
        if (!skippable) {
            return code === 10 /* \n */ || code === 13 /* \r */;
        }
    }
    return undefined;
}
/**
 * Advances past tabs, spaces and line breaks, keeping `state.lineNumber` up to date.
 *
 * A `\r` always counts as a new line; a `\n` counts unless it directly follows
 * a `\r` skipped in this same call, so `\r\n` is counted once.
 */
function skipWhitespace(state: Tokenizer) {
    let previous = -1;
    for (; state.position < state.length; ++state.position) {
        const code = state.data.charCodeAt(state.position);
        if (code === 13) { // \r
            ++state.lineNumber;
        } else if (code === 10) { // \n
            if (previous !== 13) ++state.lineNumber;
        } else if (code !== 9 && code !== 32) { // neither \t nor ' '
            return;
        }
        previous = code;
    }
}
/**
 * Advances to the next line break or to the end of the input.
 * The line break itself is not consumed.
 */
function skipLine(state: Tokenizer) {
    const { data, length } = state;
    let cursor = state.position;
    while (cursor < length) {
        const code = data.charCodeAt(cursor);
        if (code === 10 || code === 13) break; // \n or \r
        ++cursor;
    }
    state.position = cursor;
}
/**
 * Moves to the next token and records its type in `state.tokenType`.
 *
 * Leading whitespace and line breaks are skipped first. The token is then
 * classified by its first character: comment, quoted value or plain value.
 * The comment check comes first.
 *
 * @returns true when the value read is followed by a line break, i.e. it
 * completes a record; false at the end of the input; undefined after a comment
 */
function moveNextInternal(state: State) {
    const { tokenizer } = state;
    skipWhitespace(tokenizer);

    if (tokenizer.position >= tokenizer.length) {
        state.tokenType = CsvTokenType.End;
        return false;
    }

    // Start with an empty token at the current position.
    tokenizer.tokenStart = tokenizer.tokenEnd = tokenizer.position;

    const first = state.data.charCodeAt(tokenizer.position);
    if (first === state.commentCharCode) {
        state.tokenType = CsvTokenType.Comment;
        skipLine(tokenizer);
        return undefined;
    }

    state.tokenType = CsvTokenType.Value;
    return first === state.quoteCharCode
        ? eatQuoted(tokenizer, state.quoteCharCode, state.delimiterCharCode)
        : eatValue(tokenizer, state.delimiterCharCode);
}
/**
 * Moves to the next token that is not a comment.
 *
 * @returns the result of the last `moveNextInternal` call: true when the value
 * read is followed by a line break, i.e. it completes a record
 */
function moveNext(state: State) {
    let endsRecord: ReturnType<typeof moveNextInternal>;
    do {
        endsRecord = moveNextInternal(state);
    } while (state.tokenType === CsvTokenType.Comment);
    return endsRecord;
}
/**
 * Reads field tokens until `chunkSize` records are completed or the input ends.
 *
 * On entry the tokenizer is already positioned on a token that has not been
 * stored yet. Each iteration stores that token in its column (fields are
 * assigned to columns round-robin) and then moves to the next one. A record is
 * counted when the token just moved to is followed by a line break.
 *
 * A final record without a trailing line break is never reported as complete
 * by `moveNext`, so it is counted here once the end of the input is reached.
 *
 * @param chunkSize maximum number of records to complete in this call
 * @param state parser state; `tokens`, `fieldCount` and `recordCount` are updated
 * @returns the number of records completed in this call, 0 when the input is exhausted
 */
function readRecordsChunk(chunkSize: number, state: State) {
    if (state.tokenType === CsvTokenType.End) return 0;

    let counter = 0;
    const { tokens, tokenizer } = state;

    while (state.tokenType === CsvTokenType.Value && counter < chunkSize) {
        TokenBuilder.add(tokens[state.fieldCount % state.columnCount], tokenizer.tokenStart, tokenizer.tokenEnd);
        ++state.fieldCount;
        if (moveNext(state)) {
            ++state.recordCount;
            ++counter;
        }
    }

    if (state.tokenType === CsvTokenType.End && state.columnCount > 0) {
        // All fields have been stored. If they make up more complete records
        // than were counted, the last record was ended by the end of the input
        // instead of a line break. A trailing partial record is not counted.
        const completeRecords = Math.floor(state.fieldCount / state.columnCount);
        if (completeRecords > state.recordCount) {
            ++state.recordCount;
            ++counter;
        }
    }

    return counter;
}
/**
 * Reads all records in chunks as a subtask, reporting progress by input position.
 *
 * The first token is read up front, because `readRecordsChunk` expects the
 * tokenizer to be positioned on a token that has not been stored yet.
 */
function readRecordsChunks(state: State) {
    const recordsPerChunk = 100000;

    if (moveNext(state)) {
        state.recordCount += 1;
    }

    return chunkedSubtask(
        state.runtimeCtx,
        recordsPerChunk,
        state,
        readRecordsChunk,
        (ctx, s) => ctx.update({ message: 'Parsing...', current: s.tokenizer.position, max: s.data.length })
    );
}
/**
 * Registers a new column: the current token text becomes its name and a fresh
 * token list is created for its values.
 */
function addColumn(state: State) {
    const { tokenizer, columnNames, tokens } = state;

    const name = Tokenizer.getTokenString(tokenizer);
    columnNames.push(name);

    // NOTE(review): the second argument is presumably an initial size estimate
    // for the token list; confirm against TokenBuilder.create.
    const columnTokens = TokenBuilder.create(tokenizer.data, state.data.length / 80);
    tokens.push(columnTokens);
}
/**
 * Reads the first record to determine the columns.
 *
 * Every field of the first record adds one column named after the field text.
 * When `state.noColumnNamesRecord` is set, the names are replaced by the column
 * indices ('0', '1', ...) and the tokenizer is reset, so that the first record
 * is read again as data.
 *
 * Reading stops at the end of the input as well as at the end of the first
 * record, so empty input or input without any line break terminates: it
 * yields zero columns or the columns of the single unterminated record.
 */
function init(state: State) {
    let newRecord = moveNext(state);
    // Without the End check this loop would never finish once the input is
    // exhausted, because moveNext keeps returning false at the end of input.
    while (!newRecord && state.tokenType !== CsvTokenType.End) {
        addColumn(state);
        newRecord = moveNext(state);
    }
    // The field that completed the first record has not been added yet. When
    // the loop stopped at the end of the input there is no such pending field.
    if (newRecord) addColumn(state);

    state.columnCount = state.columnNames.length;

    if (state.noColumnNamesRecord) {
        for (let i = 0; i < state.columnNames.length; ++i) {
            state.columnNames[i] = `${i}`;
        }
        Tokenizer.reset(state.tokenizer);
    }
}
/**
 * Runs the complete parse: determines the columns, reads all records and
 * assembles the resulting table.
 *
 * Columns are stored by name, so when two columns share a name the later one
 * replaces the earlier one in the lookup.
 */
async function handleRecords(state: State): Promise<Data.CsvTable> {
    init(state);
    await readRecordsChunks(state);

    const columns: Data.CsvColumns = Object.create(null);
    let index = 0;
    while (index < state.columnCount) {
        const name = state.columnNames[index];
        columns[name] = Field(state.tokens[index]);
        ++index;
    }

    return Data.CsvTable(state.recordCount, state.columnNames, columns);
}
/**
 * Parses `data` within the given runtime context and wraps the resulting
 * file in a successful reader result.
 */
async function parseInternal(data: string, ctx: RuntimeContext, opts: CsvOptions): Promise<Result<Data.CsvFile>> {
    const state = State(data, ctx, opts);
    ctx.update({ message: 'Parsing...', current: 0, max: data.length });

    const file = Data.CsvFile(await handleRecords(state));
    return Result.success(file);
}
/**
 * Options controlling how CSV text is tokenized.
 * For `quote`, `comment` and `delimiter` only the first character is used.
 */
interface CsvOptions {
    /** Character that opens and closes a quoted value. */
    quote: string;
    /** Character that starts a comment. */
    comment: string;
    /** Character that separates fields. */
    delimiter: string;
    /** When true the first record is data and columns are named by their index. */
    noColumnNames: boolean;
}
/**
 * Creates a task that parses CSV text.
 *
 * @param data the CSV text
 * @param opts overrides for the defaults: quote `"`, comment `#`,
 * delimiter `,` and a first record holding the column names
 * @returns a task resolving to the parse result
 */
export function parseCsv(data: string, opts?: Partial<CsvOptions>) {
    const defaults = { quote: '"', comment: '#', delimiter: ',', noColumnNames: false };
    const completeOpts = { ...defaults, ...opts };
    return Task.create<Result<Data.CsvFile>>('Parse CSV', ctx => parseInternal(data, ctx, completeOpts));
}
|