parser.ts

/**
 * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */

// import { Column } from 'mol-data/db'
import { Tokens, TokenBuilder, Tokenizer } from '../common/text/tokenizer';
import * as Data from './data-model';
import { Field } from './field';
import { ReaderResult as Result } from '../result';
import { Task, RuntimeContext, chunkedSubtask } from '../../../mol-task';

const enum CsvTokenType {
    Value = 0,
    Comment = 1,
    End = 2
}

interface State {
    data: string,
    tokenizer: Tokenizer,

    tokenType: CsvTokenType,
    runtimeCtx: RuntimeContext,
    tokens: Tokens[],

    fieldCount: number,
    recordCount: number,
    columnCount: number,
    columnNames: string[],

    quoteCharCode: number,
    commentCharCode: number,
    delimiterCharCode: number,
    noColumnNamesRecord: boolean
}

function State(data: string, runtimeCtx: RuntimeContext, opts: CsvOptions): State {
    const tokenizer = Tokenizer(data);
    return {
        data,
        tokenizer,

        tokenType: CsvTokenType.End,
        runtimeCtx,
        tokens: [],

        fieldCount: 0,
        recordCount: 0,
        columnCount: 0,
        columnNames: [],

        quoteCharCode: opts.quote.charCodeAt(0),
        commentCharCode: opts.comment.charCodeAt(0),
        delimiterCharCode: opts.delimiter.charCodeAt(0),
        noColumnNamesRecord: opts.noColumnNames
    };
}

/**
 * Eat everything until a delimiter or newline occurs.
 * Ignores whitespace at the end of the value, i.e. trims right.
 * Returns true when a newline occurs after the value, undefined otherwise.
 */
function eatValue(state: Tokenizer, delimiterCharCode: number) {
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        ++state.position;
        switch (c) {
            case 10: // \n
            case 13: // \r
                return true;
            case delimiterCharCode:
                return;
            case 9: // \t
            case 32: // ' '
                // do not extend the token yet, so trailing whitespace is trimmed
                break;
            default:
                // extend the token past this character; assigning the current
                // position (rather than incrementing tokenEnd) keeps the bounds
                // correct when the value contains inner whitespace
                state.tokenEnd = state.position;
                break;
        }
    }
}

/**
 * Eats a quoted value. Can contain a newline.
 * Returns true when a newline occurs after the quoted value.
 *
 * Embedded quotes are represented by a pair of double quotes:
 * - ""xx"" => "xx"
 */
function eatQuoted(state: Tokenizer, quoteCharCode: number, delimiterCharCode: number) {
    ++state.position;
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        if (c === quoteCharCode) {
            const next = state.data.charCodeAt(state.position + 1);
            if (next !== quoteCharCode) {
                // get rid of the quotes.
                state.tokenStart++;
                state.tokenEnd = state.position;
                ++state.position;
                return skipEmpty(state, delimiterCharCode);
            }
            // escaped quote pair: skip the second quote so it is not
            // mistaken for the closing quote
            ++state.position;
        }
        ++state.position;
    }
    state.tokenEnd = state.position;
}

/**
 * Skips tabs, spaces and the delimiter.
 * Returns true when the char that stopped the skipping is a newline.
 */
function skipEmpty(state: Tokenizer, delimiterCharCode: number) {
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        if (c !== 9 && c !== 32 && c !== delimiterCharCode) { // neither \t nor ' ' nor the delimiter
            return c === 10 || c === 13; // \n or \r
        }
        ++state.position;
    }
}

function skipWhitespace(state: Tokenizer) {
    let prev = -1;
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        switch (c) {
            case 9: // \t
            case 32: // ' '
                prev = c;
                ++state.position;
                break;
            case 10: // \n
                // handle \r\n
                if (prev !== 13) {
                    ++state.lineNumber;
                }
                prev = c;
                ++state.position;
                break;
            case 13: // \r
                prev = c;
                ++state.position;
                ++state.lineNumber;
                break;
            default:
                return;
        }
    }
}

function skipLine(state: Tokenizer) {
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        if (c === 10 || c === 13) return; // \n or \r
        ++state.position;
    }
}

/**
 * Move to the next token.
 * Returns true when the current char is a newline, i.e. indicating a full record.
 */
function moveNextInternal(state: State) {
    const tokenizer = state.tokenizer;
    skipWhitespace(tokenizer);

    if (tokenizer.position >= tokenizer.length) {
        state.tokenType = CsvTokenType.End;
        return false;
    }

    tokenizer.tokenStart = tokenizer.position;
    tokenizer.tokenEnd = tokenizer.position;

    const c = state.data.charCodeAt(tokenizer.position);
    switch (c) {
        case state.commentCharCode:
            state.tokenType = CsvTokenType.Comment;
            skipLine(tokenizer);
            break;
        case state.quoteCharCode:
            state.tokenType = CsvTokenType.Value;
            return eatQuoted(tokenizer, state.quoteCharCode, state.delimiterCharCode);
        default:
            state.tokenType = CsvTokenType.Value;
            return eatValue(tokenizer, state.delimiterCharCode);
    }
}

/**
 * Moves to the next non-comment token/line.
 * Returns true when the current char is a newline, i.e. indicating a full record.
 */
function moveNext(state: State) {
    let newRecord = moveNextInternal(state);
    while (state.tokenType === CsvTokenType.Comment) {
        newRecord = moveNextInternal(state);
    }
    return newRecord;
}

function readRecordsChunk(chunkSize: number, state: State) {
    if (state.tokenType === CsvTokenType.End) return 0;

    let counter = 0;
    let newRecord: boolean | undefined;

    const { tokens, tokenizer } = state;
    while (state.tokenType === CsvTokenType.Value && counter < chunkSize) {
        TokenBuilder.add(tokens[state.fieldCount % state.columnCount], tokenizer.tokenStart, tokenizer.tokenEnd);
        ++state.fieldCount;
        newRecord = moveNext(state);
        if (newRecord) {
            ++state.recordCount;
            ++counter;
        }
    }
    return counter;
}

function readRecordsChunks(state: State) {
    const newRecord = moveNext(state);
    if (newRecord) ++state.recordCount;
    return chunkedSubtask(state.runtimeCtx, 100000, state, readRecordsChunk,
        (ctx, state) => ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.data.length }));
}

function addColumn(state: State) {
    state.columnNames.push(Tokenizer.getTokenString(state.tokenizer));
    // pre-size the token builder assuming roughly 80 characters per record
    state.tokens.push(TokenBuilder.create(state.tokenizer.data, state.data.length / 80));
}

function init(state: State) {
    let newRecord = moveNext(state);
    while (!newRecord) {
        addColumn(state);
        newRecord = moveNext(state);
    }
    addColumn(state);
    state.columnCount = state.columnNames.length;
    if (state.noColumnNamesRecord) {
        state.columnNames.forEach((x, i, arr) => arr[i] = i + '');
        Tokenizer.reset(state.tokenizer);
    }
}

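// Illustrative example (not from the original source): given `id,name\n1,foo\n`,
// init() tokenizes the first record as the header, yielding
// columnNames = ['id', 'name'] and one token builder per column. With
// noColumnNames set, the names are replaced by indices ('0', '1') and the
// tokenizer is reset so that the first record is parsed as data.
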
async function handleRecords(state: State): Promise<Data.CsvTable> {
    init(state);
    await readRecordsChunks(state);

    const columns: Data.CsvColumns = Object.create(null);
    for (let i = 0; i < state.columnCount; ++i) {
        columns[state.columnNames[i]] = Field(state.tokens[i]);
    }

    return Data.CsvTable(state.recordCount, state.columnNames, columns);
}

async function parseInternal(data: string, ctx: RuntimeContext, opts: CsvOptions): Promise<Result<Data.CsvFile>> {
    const state = State(data, ctx, opts);
    ctx.update({ message: 'Parsing...', current: 0, max: data.length });
    const table = await handleRecords(state);
    const result = Data.CsvFile(table);
    return Result.success(result);
}

interface CsvOptions {
    quote: string;
    comment: string;
    delimiter: string;
    noColumnNames: boolean;
}

export function parseCsv(data: string, opts?: Partial<CsvOptions>) {
    const completeOpts = Object.assign({}, { quote: '"', comment: '#', delimiter: ',', noColumnNames: false }, opts);
    return Task.create<Result<Data.CsvFile>>('Parse CSV', async ctx => {
        return await parseInternal(data, ctx, completeOpts);
    });
}
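
// Example usage (an illustrative sketch, not part of the original module). It
// assumes the Task returned by parseCsv is executed via the mol-task `run()`
// method, that an error `ReaderResult` carries a `message`, and that the
// property names on the parsed data (`table`, `columnNames`) match the
// `Data.CsvFile`/`Data.CsvTable` constructors used in `handleRecords` above.
//
// async function example() {
//     const parsed = await parseCsv('a\tb\n1\t2\n', { delimiter: '\t' }).run();
//     if (parsed.isError) {
//         console.error(parsed.message);
//         return;
//     }
//     const { table } = parsed.result;
//     console.log(table.columnNames); // ['a', 'b']
// }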