/**
 * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * mostly from https://github.com/dsehnal/CIFTools.js
 * @author David Sehnal <david.sehnal@gmail.com>
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */

import { chunkedSubtask, RuntimeContext } from '../../../../mol-task';

export { Tokenizer };

interface Tokenizer {
    data: string,
    position: number,
    length: number,
    lineNumber: number,
    tokenStart: number,
    tokenEnd: number
}

export interface Tokens {
    data: string,
    count: number,
    indices: ArrayLike<number>
}

function Tokenizer(data: string): Tokenizer {
    return {
        data,
        position: 0,
        length: data.length,
        lineNumber: 1,
        tokenStart: 0,
        tokenEnd: 0
    };
}
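
// A minimal usage sketch (illustrative, not part of the original source):
// create a state over a string, then read it line by line.
//
//     const state = Tokenizer('foo\nbar\r\nbaz');
//     Tokenizer.readLine(state);     // 'foo', state.lineNumber is now 2
//     Tokenizer.readLines(state, 2); // ['bar', 'baz']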

namespace Tokenizer {
    export function getTokenString(state: Tokenizer) {
        return state.data.substring(state.tokenStart, state.tokenEnd);
    }

    /** Resets the state */
    export function reset(state: Tokenizer) {
        state.position = 0;
        state.lineNumber = 1;
        state.tokenStart = 0;
        state.tokenEnd = 0;
    }

    /**
     * Eat everything until a newline occurs.
     */
    export function eatLine(state: Tokenizer): boolean {
        const { data } = state;
        while (state.position < state.length) {
            switch (data.charCodeAt(state.position)) {
                case 10: // \n
                    state.tokenEnd = state.position;
                    ++state.position;
                    ++state.lineNumber;
                    return true;
                case 13: // \r
                    state.tokenEnd = state.position;
                    ++state.position;
                    ++state.lineNumber;
                    // handle \r\n as a single line break
                    if (data.charCodeAt(state.position) === 10) {
                        ++state.position;
                    }
                    return true;
                default:
                    ++state.position;
                    break;
            }
        }
        state.tokenEnd = state.position;
        return state.tokenStart !== state.tokenEnd;
    }

    /** Sets the current token start to the current position */
    export function markStart(state: Tokenizer) {
        state.tokenStart = state.position;
    }

    /** Sets the current token start to the current position and moves to the next line. */
    export function markLine(state: Tokenizer) {
        state.tokenStart = state.position;
        return eatLine(state);
    }

    /** Advance the state and return the line as a string. */
    export function readLine(state: Tokenizer): string {
        markLine(state);
        return getTokenString(state);
    }

    /** Advance the state and return the trimmed line as a string. */
    export function readLineTrim(state: Tokenizer): string {
        markLine(state);
        const position = state.position;
        trim(state, state.tokenStart, state.tokenEnd);
        state.position = position;
        return getTokenString(state);
    }

    function readLinesChunk(state: Tokenizer, count: number, tokens: Tokens) {
        let read = 0;
        for (let i = 0; i < count; i++) {
            if (!markLine(state)) return read;
            TokenBuilder.addUnchecked(tokens, state.tokenStart, state.tokenEnd);
            read++;
        }
        return read;
    }

    /** Advance the state by the given number of lines and return line starts/ends as tokens. */
    export function markLines(state: Tokenizer, count: number): Tokens {
        const lineTokens = TokenBuilder.create(state.data, count * 2);
        readLinesChunk(state, count, lineTokens);
        return lineTokens;
    }

    /** Advance the state by the given number of lines and return them as strings. */
    export function readLines(state: Tokenizer, count: number): string[] {
        const ret: string[] = [];
        for (let i = 0; i < count; i++) {
            ret.push(Tokenizer.readLine(state));
        }
        return ret;
    }

    /** Advance the state by the given number of lines and return line starts/ends as tokens. */
    export async function readLinesAsync(state: Tokenizer, count: number, ctx: RuntimeContext, initialLineCount = 100000): Promise<Tokens> {
        const { length } = state;
        const lineTokens = TokenBuilder.create(state.data, count * 2);
        let linesAlreadyRead = 0;
        await chunkedSubtask(ctx, initialLineCount, state, (chunkSize, state) => {
            const linesToRead = Math.min(count - linesAlreadyRead, chunkSize);
            readLinesChunk(state, linesToRead, lineTokens);
            linesAlreadyRead += linesToRead;
            return linesToRead;
        }, (ctx, state) => ctx.update({ message: 'Parsing...', current: state.position, max: length }));
        return lineTokens;
    }
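
    // A usage sketch (illustrative, not part of the original source): the
    // RuntimeContext is typically supplied by running a mol-task Task, e.g.
    //
    //     const tokens = await Task.create('Parse', async ctx => {
    //         const state = Tokenizer(bigText);
    //         return Tokenizer.readLinesAsync(state, 100000, ctx);
    //     }).run();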

    export function readAllLines(data: string) {
        const state = Tokenizer(data);
        const tokens = TokenBuilder.create(state.data, Math.max(data.length / 80, 2));
        while (markLine(state)) {
            TokenBuilder.add(tokens, state.tokenStart, state.tokenEnd);
        }
        return tokens;
    }
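
    // Example (illustrative): tokenize every line of a document in one pass.
    //
    //     const tokens = Tokenizer.readAllLines('a\nb\nc');
    //     // tokens.count === 3; tokens.indices holds a [start, end) offset
    //     // pair into the original string for each line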

    function readLinesChunkChecked(state: Tokenizer, count: number, tokens: Tokens) {
        let read = 0;
        for (let i = 0; i < count; i++) {
            if (!markLine(state)) return read;
            TokenBuilder.add(tokens, state.tokenStart, state.tokenEnd);
            read++;
        }
        return read;
    }

    export async function readAllLinesAsync(data: string, ctx: RuntimeContext, chunkSize = 100000) {
        const state = Tokenizer(data);
        const tokens = TokenBuilder.create(state.data, Math.max(data.length / 80, 2));
        await chunkedSubtask(ctx, chunkSize, state, (chunkSize, state) => {
            readLinesChunkChecked(state, chunkSize, tokens);
            return state.position < state.length ? chunkSize : 0;
        }, (ctx, state) => ctx.update({ message: 'Parsing...', current: state.position, max: state.length }));
        return tokens;
    }

    /**
     * Eat everything until a whitespace/newline occurs.
     */
    export function eatValue(state: Tokenizer) {
        while (state.position < state.length) {
            switch (state.data.charCodeAt(state.position)) {
                case 9: // \t
                case 10: // \n
                case 13: // \r
                case 32: // ' '
                    state.tokenEnd = state.position;
                    return;
                default:
                    ++state.position;
                    break;
            }
        }
        state.tokenEnd = state.position;
    }

    /**
     * Skips all whitespace - space, tab, newline, CR.
     * Increments the line count for line breaks and returns the char code
     * of the last whitespace character read (-1 if none).
     */
    export function skipWhitespace(state: Tokenizer): number {
        let prev = -1;
        while (state.position < state.length) {
            const c = state.data.charCodeAt(state.position);
            switch (c) {
                case 9: // '\t'
                case 32: // ' '
                    prev = c;
                    ++state.position;
                    break;
                case 10: // \n
                    // handle \r\n
                    if (prev !== 13) {
                        ++state.lineNumber;
                    }
                    prev = c;
                    ++state.position;
                    break;
                case 13: // \r
                    prev = c;
                    ++state.position;
                    ++state.lineNumber;
                    break;
                default:
                    return prev;
            }
        }
        return prev;
    }
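
    // Example (illustrative): skip leading whitespace, then mark and eat a value.
    //
    //     const state = Tokenizer('  \n  value');
    //     Tokenizer.skipWhitespace(state); // state.lineNumber is now 2
    //     Tokenizer.markStart(state);
    //     Tokenizer.eatValue(state);
    //     Tokenizer.getTokenString(state); // 'value'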

    /** Trims spaces and tabs */
    export function trim(state: Tokenizer, start: number, end: number) {
        const { data } = state;
        let s = start, e = end - 1;
        let c = data.charCodeAt(s);
        while ((c === 9 || c === 32) && s <= e) c = data.charCodeAt(++s);
        c = data.charCodeAt(e);
        while ((c === 9 || c === 32) && e >= s) c = data.charCodeAt(--e);
        state.tokenStart = s;
        state.tokenEnd = e + 1;
        state.position = end;
        return state;
    }
}

export function trimStr(data: string, start: number, end: number) {
    let s = start, e = end - 1;
    let c = data.charCodeAt(s);
    while ((c === 9 || c === 32) && s <= e) c = data.charCodeAt(++s);
    c = data.charCodeAt(e);
    while ((c === 9 || c === 32) && e >= s) c = data.charCodeAt(--e);
    return data.substring(s, e + 1);
}
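
// Example (illustrative): trimStr returns the substring of data in [start, end)
// with leading and trailing spaces/tabs removed.
//
//     trimStr('  a b\t', 0, 6); // => 'a b'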

export namespace TokenBuilder {
    interface Builder extends Tokens {
        offset: number,
        indices: Uint32Array,
        indicesLenMinus2: number
    }

    function resize(builder: Builder) {
        // scale the size using golden ratio, because why not.
        const newBuffer = new Uint32Array((1.61 * builder.indices.length) | 0);
        newBuffer.set(builder.indices);
        builder.indices = newBuffer;
        builder.indicesLenMinus2 = (newBuffer.length - 2) | 0;
    }

    export function add(tokens: Tokens, start: number, end: number) {
        const builder = tokens as Builder;
        if (builder.offset > builder.indicesLenMinus2) {
            resize(builder);
        }
        builder.indices[builder.offset++] = start;
        builder.indices[builder.offset++] = end;
        tokens.count++;
    }

    export function addToken(tokens: Tokens, tokenizer: Tokenizer) {
        add(tokens, tokenizer.tokenStart, tokenizer.tokenEnd);
    }

    export function addUnchecked(tokens: Tokens, start: number, end: number) {
        (tokens as Builder).indices[(tokens as Builder).offset++] = start;
        (tokens as Builder).indices[(tokens as Builder).offset++] = end;
        tokens.count++;
    }

    export function create(data: string, size: number): Tokens {
        size = Math.max(10, size);
        return <Builder>{
            data,
            indicesLenMinus2: (size - 2) | 0,
            count: 0,
            offset: 0,
            indices: new Uint32Array(size)
        };
    }
}
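
// A usage sketch (illustrative, not part of the original source): build a
// token list by hand and recover the token text from the shared data string.
//
//     const data = 'alpha beta';
//     const tokens = TokenBuilder.create(data, 8); // capacity is clamped to >= 10
//     TokenBuilder.add(tokens, 0, 5);  // 'alpha'
//     TokenBuilder.add(tokens, 6, 10); // 'beta'
//     for (let i = 0; i < tokens.count; i++) {
//         const s = tokens.indices[2 * i], e = tokens.indices[2 * i + 1];
//         console.log(data.substring(s, e));
//     }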