/**
 * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * mostly from https://github.com/dsehnal/CIFTools.js
 * @author David Sehnal <david.sehnal@gmail.com>
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */

import { chunkedSubtask, RuntimeContext } from '../../../../mol-task';

export { Tokenizer };
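
/** Mutable tokenizer state: a cursor into `data` plus the bounds of the most recently read token. */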
interface Tokenizer {
    data: string,
    position: number,
    length: number,
    lineNumber: number,
    tokenStart: number,
    tokenEnd: number
}
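
/** A flat list of token boundaries into `data`: `indices[2 * i]` and `indices[2 * i + 1]` delimit token `i`. */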
export interface Tokens {
    data: string,
    count: number,
    indices: ArrayLike<number>
}
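
/** Creates a tokenizer positioned at the start of `data`. */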
function Tokenizer(data: string): Tokenizer {
    return {
        data,
        position: 0,
        length: data.length,
        lineNumber: 1,
        tokenStart: 0,
        tokenEnd: 0
    };
}

namespace Tokenizer {
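    /** Returns the current token as a substring of the underlying data. */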
    export function getTokenString(state: Tokenizer) {
        return state.data.substring(state.tokenStart, state.tokenEnd);
    }

    /** Resets the state */
    export function reset(state: Tokenizer) {
        state.position = 0;
        state.lineNumber = 1;
        state.tokenStart = 0;
        state.tokenEnd = 0;
    }

    /**
     * Eat everything until a newline occurs.
     */
    export function eatLine(state: Tokenizer): boolean {
        const { data } = state;
        while (state.position < state.length) {
            switch (data.charCodeAt(state.position)) {
                case 10: // \n
                    state.tokenEnd = state.position;
                    ++state.position;
                    ++state.lineNumber;
                    return true;
                case 13: // \r
                    state.tokenEnd = state.position;
                    ++state.position;
                    ++state.lineNumber;
                    // consume the \n of a \r\n pair
                    if (data.charCodeAt(state.position) === 10) {
                        ++state.position;
                    }
                    return true;
                default:
                    ++state.position;
                    break;
            }
        }
        state.tokenEnd = state.position;
        return state.tokenStart !== state.tokenEnd;
    }

    /** Sets the current token start to the current position */
    export function markStart(state: Tokenizer) {
        state.tokenStart = state.position;
    }

    /** Sets the current token start to the current position and moves to the next line. */
    export function markLine(state: Tokenizer) {
        state.tokenStart = state.position;
        return eatLine(state);
    }

    /** Advance the state and return the line as a string. */
    export function readLine(state: Tokenizer): string {
        markLine(state);
        return getTokenString(state);
    }

    /** Advance the state and return the trimmed line as a string. */
    export function readLineTrim(state: Tokenizer): string {
        markLine(state);
        const position = state.position;
        trim(state, state.tokenStart, state.tokenEnd);
        state.position = position;
        return getTokenString(state);
    }
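
    /** Reads up to `count` lines into `tokens` without bounds checking; returns the number of lines actually read. */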
    function readLinesChunk(state: Tokenizer, count: number, tokens: Tokens) {
        let read = 0;
        for (let i = 0; i < count; i++) {
            if (!markLine(state)) return read;
            TokenBuilder.addUnchecked(tokens, state.tokenStart, state.tokenEnd);
            read++;
        }
        return read;
    }

    /** Advance the state by the given number of lines and return their starts/ends as tokens. */
    export function markLines(state: Tokenizer, count: number): Tokens {
        const lineTokens = TokenBuilder.create(state.data, count * 2);
        readLinesChunk(state, count, lineTokens);
        return lineTokens;
    }

    /** Advance the state by the given number of lines and return them */
    export function readLines(state: Tokenizer, count: number): string[] {
        const ret: string[] = [];
        for (let i = 0; i < count; i++) {
            ret.push(Tokenizer.readLine(state));
        }
        return ret;
    }

    /** Advance the state by the given number of lines and return line starts/ends as tokens. */
    export async function readLinesAsync(state: Tokenizer, count: number, ctx: RuntimeContext, initialLineCount = 100000): Promise<Tokens> {
        const lineTokens = TokenBuilder.create(state.data, count * 2);

        let linesAlreadyRead = 0;
        await chunkedSubtask(ctx, initialLineCount, state, (chunkSize, state) => {
            const linesToRead = Math.min(count - linesAlreadyRead, chunkSize);
            readLinesChunk(state, linesToRead, lineTokens);
            linesAlreadyRead += linesToRead;
            return linesToRead;
        }, (ctx, state) => ctx.update({ message: 'Parsing...', current: state.position, max: state.length }));

        return lineTokens;
    }
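
    /** Tokenizes every line in `data`; the initial buffer assumes an average line length of ~80 characters. */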
    export function readAllLines(data: string) {
        const state = Tokenizer(data);
        const tokens = TokenBuilder.create(state.data, Math.max(data.length / 80, 2));
        while (markLine(state)) {
            TokenBuilder.add(tokens, state.tokenStart, state.tokenEnd);
        }
        return tokens;
    }
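
    /** Same as `readLinesChunk`, but grows the token buffer as needed instead of writing unchecked. */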
    function readLinesChunkChecked(state: Tokenizer, count: number, tokens: Tokens) {
        let read = 0;
        for (let i = 0; i < count; i++) {
            if (!markLine(state)) return read;
            TokenBuilder.add(tokens, state.tokenStart, state.tokenEnd);
            read++;
        }
        return read;
    }
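
    /** Tokenizes every line in `data` as a chunked subtask, reporting progress to `ctx`. */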
    export async function readAllLinesAsync(data: string, ctx: RuntimeContext, chunkSize = 100000) {
        const state = Tokenizer(data);
        const tokens = TokenBuilder.create(state.data, Math.max(data.length / 80, 2));

        await chunkedSubtask(ctx, chunkSize, state, (chunkSize, state) => {
            readLinesChunkChecked(state, chunkSize, tokens);
            return state.position < state.length ? chunkSize : 0;
        }, (ctx, state) => ctx.update({ message: 'Parsing...', current: state.position, max: state.length }));

        return tokens;
    }

    /**
     * Eat everything until a whitespace/newline occurs.
     */
    export function eatValue(state: Tokenizer) {
        while (state.position < state.length) {
            switch (state.data.charCodeAt(state.position)) {
                case 9: // \t
                case 10: // \n
                case 13: // \r
                case 32: // ' '
                    state.tokenEnd = state.position;
                    return;
                default:
                    ++state.position;
                    break;
            }
        }
        state.tokenEnd = state.position;
    }

    /**
     * Skips all whitespace - space, tab, newline, CR.
     * Handles incrementing the line count and returns the char code
     * of the last whitespace character skipped (or -1 if none).
     */
    export function skipWhitespace(state: Tokenizer): number {
        let prev = -1;
        while (state.position < state.length) {
            const c = state.data.charCodeAt(state.position);
            switch (c) {
                case 9: // '\t'
                case 32: // ' '
                    prev = c;
                    ++state.position;
                    break;
                case 10: // \n
                    // only count the line break once for a \r\n pair
                    if (prev !== 13) {
                        ++state.lineNumber;
                    }
                    prev = c;
                    ++state.position;
                    break;
                case 13: // \r
                    prev = c;
                    ++state.position;
                    ++state.lineNumber;
                    break;
                default:
                    return prev;
            }
        }
        return prev;
    }

    /** Trims spaces and tabs */
    export function trim(state: Tokenizer, start: number, end: number) {
        const { data } = state;
        let s = start, e = end - 1;

        let c = data.charCodeAt(s);
        while ((c === 9 || c === 32) && s <= e) c = data.charCodeAt(++s);
        c = data.charCodeAt(e);
        while ((c === 9 || c === 32) && e >= s) c = data.charCodeAt(--e);

        state.tokenStart = s;
        state.tokenEnd = e + 1;
        state.position = end;
        return state;
    }
}
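
/** Returns `data.substring(start, end)` with leading and trailing spaces and tabs removed. */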
export function trimStr(data: string, start: number, end: number) {
    let s = start, e = end - 1;
    let c = data.charCodeAt(s);
    while ((c === 9 || c === 32) && s <= e) c = data.charCodeAt(++s);
    c = data.charCodeAt(e);
    while ((c === 9 || c === 32) && e >= s) c = data.charCodeAt(--e);
    return data.substring(s, e + 1);
}
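
/** Helpers for building a `Tokens` buffer of start/end index pairs that grows as tokens are added. */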
export namespace TokenBuilder {
    interface Builder extends Tokens {
        offset: number,
        indices: Uint32Array,
        indicesLenMinus2: number
    }

    function resize(builder: Builder) {
        // scale the size using golden ratio, because why not.
        const newBuffer = new Uint32Array((1.61 * builder.indices.length) | 0);
        newBuffer.set(builder.indices);
        builder.indices = newBuffer;
        builder.indicesLenMinus2 = (newBuffer.length - 2) | 0;
    }
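
    /** Adds a token, growing the underlying buffer if necessary. */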
    export function add(tokens: Tokens, start: number, end: number) {
        const builder = tokens as Builder;
        if (builder.offset > builder.indicesLenMinus2) {
            resize(builder);
        }
        builder.indices[builder.offset++] = start;
        builder.indices[builder.offset++] = end;
        tokens.count++;
    }
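
    /** Adds the tokenizer's current token. */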
    export function addToken(tokens: Tokens, tokenizer: Tokenizer) {
        add(tokens, tokenizer.tokenStart, tokenizer.tokenEnd);
    }
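
    /** Adds a token without a capacity check; the caller must have pre-sized the buffer. */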
    export function addUnchecked(tokens: Tokens, start: number, end: number) {
        (tokens as Builder).indices[(tokens as Builder).offset++] = start;
        (tokens as Builder).indices[(tokens as Builder).offset++] = end;
        tokens.count++;
    }
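
    /** Creates a token buffer over `data` with an initial capacity of `size` indices (at least 10), i.e. `size / 2` tokens. */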
    export function create(data: string, size: number): Tokens {
        size = Math.max(10, size);
        return <Builder>{
            data,
            indicesLenMinus2: (size - 2) | 0,
            count: 0,
            offset: 0,
            indices: new Uint32Array(size)
        };
    }
}