
wip PDB parser

David Sehnal, 6 years ago
commit 641132e752

+ 48 - 2
src/mol-io/reader/cif/text/field.ts

@@ -29,8 +29,9 @@ export default function CifTextField(tokens: Tokens, rowCount: number): Data.Cif
     };
 
     const valueKind: Data.CifField['valueKind'] = row => {
-        const s = indices[2 * row];
-        if (indices[2 * row + 1] - s !== 1) return Column.ValueKind.Present;
+        const s = indices[2 * row], l = indices[2 * row + 1] - s;
+        if (l > 1) return Column.ValueKind.Present;
+        if (l === 0) return Column.ValueKind.NotPresent;
         const v = data.charCodeAt(s);
         if (v === 46 /* . */) return Column.ValueKind.NotPresent;
         if (v === 63 /* ? */) return Column.ValueKind.Unknown;
@@ -51,4 +52,49 @@ export default function CifTextField(tokens: Tokens, rowCount: number): Data.Cif
         toIntArray: params => ColumnHelpers.createAndFillArray(rowCount, int, params),
         toFloatArray: params => ColumnHelpers.createAndFillArray(rowCount, float, params)
     }
+}
+
+export function CifTextValueField(values: string[]): Data.CifField {
+    const rowCount = values.length;
+
+    const str: Data.CifField['str'] = row => {
+        const ret = values[row];
+        if (!ret || ret === '.' || ret === '?') return '';
+        return ret;
+    };
+
+    const int: Data.CifField['int'] = row => {
+        const v = values[row];
+        return fastParseInt(v, 0, v.length) || 0;
+    };
+
+    const float: Data.CifField['float'] = row => {
+        const v = values[row];
+        return fastParseFloat(v, 0, v.length) || 0;
+    };
+
+    const valueKind: Data.CifField['valueKind'] = row => {
+        const v = values[row], l = v.length;
+        if (l > 1) return Column.ValueKind.Present;
+        if (l === 0) return Column.ValueKind.NotPresent;
+        const c = v.charCodeAt(0);
+        if (c === 46 /* . */) return Column.ValueKind.NotPresent;
+        if (c === 63 /* ? */) return Column.ValueKind.Unknown;
+        return Column.ValueKind.Present;
+    };
+
+    return {
+        __array: void 0,
+        binaryEncoding: void 0,
+        isDefined: true,
+        rowCount,
+        str,
+        int,
+        float,
+        valueKind,
+        areValuesEqual: (rowA, rowB) => values[rowA] === values[rowB],
+        toStringArray: params => ColumnHelpers.createAndFillArray(rowCount, str, params),
+        toIntArray: params => ColumnHelpers.createAndFillArray(rowCount, int, params),
+        toFloatArray: params => ColumnHelpers.createAndFillArray(rowCount, float, params)
+    }
 }
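
For orientation, a small usage sketch of the new value-backed field (not part of the commit; the sample values are invented):

    const f = CifTextValueField(['1', '?', '42']);
    f.int(0);       // 1
    f.int(2);       // 42
    f.str(1);       // '' ('?' and '.' normalize to the empty string)
    f.valueKind(1); // Column.ValueKind.Unknown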

+ 1 - 1
src/mol-io/reader/cif/text/parser.ts

@@ -507,7 +507,7 @@ async function handleLoop(tokenizer: TokenizerState, ctx: FrameContext): Promise
     const rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
     const tokens: Tokens[] = [];
     const fieldCount = fieldNames.length;
-    for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer, rowCountEstimate);
+    for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer.data, rowCountEstimate);
 
     const state: LoopReadState = {
         fieldCount,

+ 6 - 6
src/mol-io/reader/common/text/tokenizer.ts

@@ -109,7 +109,7 @@ namespace Tokenizer {
 
     /** Advance the state by the given number of lines and return line starts/ends as tokens. */
     export function readLines(state: Tokenizer, count: number): Tokens {
-        const lineTokens = TokenBuilder.create(state, count * 2);
+        const lineTokens = TokenBuilder.create(state.data, count * 2);
         readLinesChunk(state, count, lineTokens);
         return lineTokens;
     }
@@ -117,7 +117,7 @@ namespace Tokenizer {
     /** Advance the state by the given number of lines and return line starts/ends as tokens. */
     export async function readLinesAsync(state: Tokenizer, count: number, ctx: RuntimeContext, initialLineCount = 100000): Promise<Tokens> {
         const { length } = state;
-        const lineTokens = TokenBuilder.create(state, count * 2);
+        const lineTokens = TokenBuilder.create(state.data, count * 2);
 
         let linesAlreadyRead = 0;
         await chunkedSubtask(ctx, initialLineCount, state, (chunkSize, state) => {
@@ -132,7 +132,7 @@ namespace Tokenizer {
 
     export function readAllLines(data: string) {
         const state = Tokenizer(data);
-        const tokens = TokenBuilder.create(state, Math.max(data.length / 160, 2))
+        const tokens = TokenBuilder.create(state.data, Math.max(data.length / 80, 2))
         while (markLine(state)) {
             TokenBuilder.add(tokens, state.tokenStart, state.tokenEnd);
         }
@@ -141,7 +141,7 @@ namespace Tokenizer {
 
     export async function readAllLinesAsync(data: string, ctx: RuntimeContext, chunkSize = 100000) {
         const state = Tokenizer(data);
-        const tokens = TokenBuilder.create(state, Math.max(data.length / 160, 2));
+        const tokens = TokenBuilder.create(state.data, Math.max(data.length / 80, 2));
 
         await chunkedSubtask(ctx, chunkSize, state, (chunkSize, state) => {
             readLinesChunk(state, chunkSize, tokens);
@@ -261,10 +261,10 @@ export namespace TokenBuilder {
         tokens.count++;
     }
 
-    export function create(tokenizer: Tokenizer, size: number): Tokens {
+    export function create(data: string, size: number): Tokens {
         size = Math.max(10, size)
         return <Builder>{
-            data: tokenizer.data,
+            data,
             indicesLenMinus2: (size - 2) | 0,
             count: 0,
             offset: 0,
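
With the new signature, a token buffer can be created from a raw string, so callers that never build a full Tokenizer (such as the PDB-to-mmCIF converter added below) can still produce Tokens. A minimal sketch with invented data:

    const data = 'ATOM      1  N   ALA A   1';
    const tokens = TokenBuilder.create(data, 16); // capacity hint; the builder grows as needed
    TokenBuilder.add(tokens, 0, 4);               // records a token spanning data[0..4), i.e. 'ATOM'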

+ 1 - 1
src/mol-io/reader/csv/parser.ts

@@ -231,7 +231,7 @@ function readRecordsChunks(state: State) {
 
 function addColumn (state: State) {
     state.columnNames.push(Tokenizer.getTokenString(state.tokenizer))
-    state.tokens.push(TokenBuilder.create(state.tokenizer, state.data.length / 80))
+    state.tokens.push(TokenBuilder.create(state.tokenizer.data, state.data.length / 80))
 }
 
 function init(state: State) {

+ 15 - 15
src/mol-io/reader/mol2/parser.ts

@@ -130,12 +130,12 @@ async function handleAtoms(state: State): Promise<Schema.Mol2Atoms> {
     }
 
     // required columns
-    const atom_idTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const atom_nameTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const xTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const yTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const zTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const atom_typeTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
+    const atom_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const atom_nameTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const xTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const yTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const zTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const atom_typeTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
 
     const atom_idTokenColumn = TokenColumn(atom_idTokens);
     const atom_nameTokenColumn = TokenColumn(atom_nameTokens);
@@ -145,10 +145,10 @@ async function handleAtoms(state: State): Promise<Schema.Mol2Atoms> {
     const atom_typeColumn = TokenColumn(atom_typeTokens);
 
     // optional columns
-    const subst_idTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const subst_nameTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const chargeTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const status_bitTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
+    const subst_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const subst_nameTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const chargeTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const status_bitTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
 
     const subst_idTokenColumn = TokenColumn(subst_idTokens);
     const subst_nameTokenColumn = TokenColumn(subst_nameTokens);
@@ -257,10 +257,10 @@ async function handleBonds(state: State): Promise<Schema.Mol2Bonds> {
     }
 
     // required columns
-    const bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
-    const origin_bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
-    const target_bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
-    const bondTypeTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
+    const bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
+    const origin_bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
+    const target_bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
+    const bondTypeTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
 
     const bond_idTokenColumn = TokenColumn(bond_idTokens);
     const origin_bond_idTokenColumn = TokenColumn(origin_bond_idTokens);
@@ -268,7 +268,7 @@ async function handleBonds(state: State): Promise<Schema.Mol2Bonds> {
     const bondTypeTokenColumn = TokenColumn(bondTypeTokens);
 
     // optional columns
-    const status_bitTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
+    const status_bitTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
     const status_bitTokenColumn = TokenColumn(status_bitTokens);
     const undefStr = Column.Undefined(molecule.num_bonds, Column.Schema.str);
 

+ 2 - 2
src/mol-io/reader/pdb/parser.ts

@@ -9,6 +9,6 @@ import { Task } from 'mol-task';
 import { ReaderResult } from '../result';
 import { Tokenizer } from '../common/text/tokenizer';
 
-export function parse(data: string): Task<ReaderResult<PdbFile>> {
-    return Task.create('Parse PDB', async ctx => ReaderResult.success({ lines: await Tokenizer.readAllLinesAsync(data, ctx) }));
+export function parse(data: string, id?: string): Task<ReaderResult<PdbFile>> {
+    return Task.create('Parse PDB', async ctx => ReaderResult.success({ id, lines: await Tokenizer.readAllLinesAsync(data, ctx) }));
 }
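
A hypothetical call site (not part of the commit; pdbData is a placeholder, and the result shape follows ReaderResult from '../result'):

    const parsed = await parse(pdbData, '1CBS').run();
    if (!parsed.isError) {
        console.log(parsed.result.id);          // '1CBS'
        console.log(parsed.result.lines.count); // number of tokenized lines
    }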

+ 1 - 0
src/mol-io/reader/pdb/schema.ts

@@ -7,5 +7,6 @@
 import { Tokens } from '../common/text/tokenizer';
 
 export interface PdbFile {
+    id?: string,
     lines: Tokens
 }

+ 4 - 3
src/mol-model/structure/model/format.ts

@@ -7,6 +7,7 @@
 // import { File as GroFile } from 'mol-io/reader/gro/schema'
 import { mmCIF_Database } from 'mol-io/reader/cif/schema/mmcif'
 import CIF, { CifFrame } from 'mol-io/reader/cif';
+import { PdbFile } from 'mol-io/reader/pdb/schema';
 
 type Format =
     // | Format.gro
@@ -15,10 +16,10 @@ type Format =
 namespace Format {
     // export interface gro { kind: 'gro', data: GroFile }
     export interface mmCIF { kind: 'mmCIF', data: mmCIF_Database, frame: CifFrame }
+    export function mmCIF(frame: CifFrame, data?: mmCIF_Database): mmCIF { return { kind: 'mmCIF', data: data || CIF.schema.mmCIF(frame), frame }; }
 
-    export function mmCIF(frame: CifFrame, data?: mmCIF_Database): mmCIF {
-        return { kind: 'mmCIF', data: data || CIF.schema.mmCIF(frame), frame };
-    }
+    export interface PDB { kind: 'PDB', data: PdbFile }
+    export function PDB(data: PdbFile): PDB { return { kind: 'PDB', data }; }
 }
 
 export default Format
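
The interface/function pairs double as a tagged union, so downstream code can narrow on the kind tag. Sketch (not part of the commit):

    declare const format: Format.mmCIF | Format.PDB;
    switch (format.kind) {
        case 'mmCIF': format.data; break; // narrowed to mmCIF_Database
        case 'PDB': format.data; break;   // narrowed to PdbFile
    }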

+ 269 - 0
src/mol-model/structure/model/formats/pdb.ts

@@ -0,0 +1,269 @@
+/**
+ * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author David Sehnal <david.sehnal@gmail.com>
+ */
+
+import Format from '../format';
+import { Model } from '../model';
+import { Task } from 'mol-task';
+import { PdbFile } from 'mol-io/reader/pdb/schema';
+import from_mmCIF from './mmcif';
+import { mmCIF_Schema } from 'mol-io/reader/cif/schema/mmcif';
+import { substringStartsWith } from 'mol-util/string';
+import { TokenBuilder, Tokenizer } from 'mol-io/reader/common/text/tokenizer';
+import { CifField, CifCategory } from 'mol-io/reader/cif';
+import CifTextField, { CifTextValueField } from 'mol-io/reader/cif/text/field';
+
+function toCategory(name: string, fields: { [name: string]: CifField | undefined }, rowCount: number): CifCategory {
+    return {
+        name,
+        fieldNames: Object.keys(fields),
+        rowCount,
+        getField(f: string) {
+            return fields[f];
+        }
+    }
+}
+
+function _entity(): { [K in keyof mmCIF_Schema['entity']]?: CifField } {
+    return {
+        id: CifTextValueField(['1', '2', '3']),
+        type: CifTextValueField(['polymer', 'non-polymer', 'water'])
+    }
+}
+
+function atom_site_template(data: string, count: number) {
+    const str = () => [] as string[];
+    const ts = () => TokenBuilder.create(data, 2 * count);
+    return {
+        count,
+        group_PDB: ts(),
+        id: str(),
+        auth_atom_id: ts(),
+        label_alt_id: ts(),
+        auth_comp_id: ts(),
+        auth_asym_id: ts(),
+        auth_seq_id: ts(),
+        pdbx_PDB_ins_code: ts(),
+        Cartn_x: ts(),
+        Cartn_y: ts(),
+        Cartn_z: ts(),
+        occupancy: ts(),
+        B_iso_or_equiv: ts(),
+        type_symbol: ts(),
+        pdbx_PDB_model_num: str(),
+        label_entity_id: str()
+    };
+}
+
+function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } {
+    const auth_asym_id = CifTextField(sites.auth_asym_id, sites.count);
+    const auth_atom_id = CifTextField(sites.auth_atom_id, sites.count);
+    const auth_comp_id = CifTextField(sites.auth_comp_id, sites.count);
+    const auth_seq_id = CifTextField(sites.auth_seq_id, sites.count);
+
+    return {
+        auth_asym_id,
+        auth_atom_id,
+        auth_comp_id,
+        auth_seq_id,
+        B_iso_or_equiv: CifTextField(sites.B_iso_or_equiv, sites.count),
+        Cartn_x: CifTextField(sites.Cartn_x, sites.count),
+        Cartn_y: CifTextField(sites.Cartn_y, sites.count),
+        Cartn_z: CifTextField(sites.Cartn_z, sites.count),
+        group_PDB: CifTextField(sites.group_PDB, sites.count),
+        id: CifTextValueField(sites.id),
+
+        label_alt_id: CifTextField(sites.label_alt_id, sites.count),
+
+        label_asym_id: auth_asym_id,
+        label_atom_id: auth_atom_id,
+        label_comp_id: auth_comp_id,
+        label_seq_id: auth_seq_id,
+        label_entity_id: CifTextValueField(sites.label_entity_id),
+
+        occupancy: CifTextField(sites.occupancy, sites.count),
+        type_symbol: CifTextField(sites.type_symbol, sites.count),
+
+        pdbx_PDB_ins_code: CifTextField(sites.pdbx_PDB_ins_code, sites.count),
+        pdbx_PDB_model_num: CifTextValueField(sites.pdbx_PDB_model_num)
+    };
+}
+
+function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number) {
+    const { data: str } = data;
+    const startPos = s;
+    let start = s;
+    const length = e - s;
+
+    // TODO: filter invalid atoms
+
+    // COLUMNS        DATA TYPE       CONTENTS
+    // --------------------------------------------------------------------------------
+    // 1 -  6        Record name     "ATOM  "
+    Tokenizer.trim(data, start, start + 6);
+    TokenBuilder.add(sites.group_PDB, data.tokenStart, data.tokenEnd);
+
+    // 7 - 11        Integer         Atom serial number.
+    // TODO: support HEX
+    start = startPos + 6;
+    Tokenizer.trim(data, start, start + 5);
+    sites.id.push(data.data.substring(data.tokenStart, data.tokenEnd));
+
+    // 13 - 16        Atom            Atom name.
+    start = startPos + 12;
+    Tokenizer.trim(data, start, start + 4);
+    TokenBuilder.add(sites.auth_atom_id, data.tokenStart, data.tokenEnd);
+
+    // 17             Character       Alternate location indicator.
+    if (str.charCodeAt(startPos + 16) === 32) { // ' '
+        TokenBuilder.add(sites.label_alt_id, 0, 0);
+    } else {
+        TokenBuilder.add(sites.label_alt_id, startPos + 16, startPos + 17);
+    }
+
+    // 18 - 20        Residue name    Residue name.
+    start = startPos + 17;
+    Tokenizer.trim(data, start, start + 3);
+    TokenBuilder.add(sites.auth_comp_id, data.tokenStart, data.tokenEnd);
+
+    // 22             Character       Chain identifier.
+    TokenBuilder.add(sites.auth_asym_id, startPos + 21, startPos + 22);
+
+    // 23 - 26        Integer         Residue sequence number.
+    // TODO: support HEX
+    start = startPos + 22;
+    Tokenizer.trim(data, start, start + 4);
+    TokenBuilder.add(sites.auth_seq_id, data.tokenStart, data.tokenEnd);
+
+    // 27             AChar           Code for insertion of residues.
+    if (str.charCodeAt(startPos + 26) === 32) { // ' '
+        TokenBuilder.add(sites.pdbx_PDB_ins_code, 0, 0);
+    } else {
+        TokenBuilder.add(sites.pdbx_PDB_ins_code, startPos + 26, startPos + 27);
+    }
+
+    // 31 - 38        Real(8.3)       Orthogonal coordinates for X in Angstroms.
+    start = startPos + 30;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_x, data.tokenStart, data.tokenEnd);
+
+    // 39 - 46        Real(8.3)       Orthogonal coordinates for Y in Angstroms.
+    start = startPos + 38;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_y, data.tokenStart, data.tokenEnd);
+
+    // 47 - 54        Real(8.3)       Orthogonal coordinates for Z in Angstroms.
+    start = startPos + 46;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_z, data.tokenStart, data.tokenEnd);
+
+    // 55 - 60        Real(6.2)       Occupancy.
+    start = startPos + 54;
+    Tokenizer.trim(data, start, start + 6);
+    TokenBuilder.add(sites.occupancy, data.tokenStart, data.tokenEnd);
+
+    // 61 - 66        Real(6.2)       Temperature factor (Default = 0.0).
+    if (length >= 66) {
+        start = startPos + 60;
+        Tokenizer.trim(data, start, start + 6);
+        TokenBuilder.add(sites.B_iso_or_equiv, data.tokenStart, data.tokenEnd);
+    } else {
+        TokenBuilder.add(sites.B_iso_or_equiv, 0, 0);
+    }
+
+    // 73 - 76        LString(4)      Segment identifier, left-justified.
+    // ignored
+
+    // 77 - 78        LString(2)      Element symbol, right-justified.
+    if (length >= 78) {
+        start = startPos + 76;
+        Tokenizer.trim(data, start, start + 2);
+
+        if (data.tokenStart < data.tokenEnd) {
+            TokenBuilder.add(sites.type_symbol, data.tokenStart, data.tokenEnd);
+        } else {
+            // "guess" the symbol
+            TokenBuilder.add(sites.type_symbol, startPos + 12, startPos + 13);
+        }
+    } else {
+        TokenBuilder.add(sites.type_symbol, startPos + 12, startPos + 13);
+    }
+
+    // TODO
+    sites.label_entity_id.push('1');
+    sites.pdbx_PDB_model_num.push(model);
+}
+
+type AtomSiteTemplate = ReturnType<typeof atom_site_template>
+
+async function pdbToMmCIF(pdb: PdbFile): Promise<Format.mmCIF> {
+    const { lines } = pdb;
+    const { data, indices } = lines;
+    const tokenizer = Tokenizer(data);
+
+    // Count the atoms
+    let atomCount = 0;
+    for (let i = 0, _i = lines.count; i < _i; i++) {
+        const s = indices[2 * i], e = indices[2 * i + 1];
+        switch (data[s]) {
+            case 'A':
+                if (substringStartsWith(data, s, e, 'ATOM  ')) atomCount++;
+                break;
+            case 'H':
+                if (substringStartsWith(data, s, e, 'HETATM')) atomCount++;
+                break;
+        }
+    }
+
+    const atom_site = atom_site_template(data, atomCount);
+
+    let modelNum = 0, modelStr = '';
+
+    for (let i = 0, _i = lines.count; i < _i; i++) {
+        const s = indices[2 * i], e = indices[2 * i + 1];
+        switch (data[s]) {
+            case 'A':
+                if (!substringStartsWith(data, s, e, 'ATOM  ')) continue;
+                if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
+                addAtom(atom_site, modelStr, tokenizer, s, e);
+                break;
+            case 'H':
+                if (!substringStartsWith(data, s, e, 'HETATM')) continue;
+                if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
+                addAtom(atom_site, modelStr, tokenizer, s, e);
+                break;
+            case 'M':
+                if (substringStartsWith(data, s, e, 'MODEL ')) {
+                    modelNum++;
+                    modelStr = '' + modelNum;
+                }
+                break;
+
+        }
+    }
+
+    const categories = {
+        entity: toCategory('entity', _entity(), 3),
+        atom_site: toCategory('atom_site', _atom_site(atom_site), atomCount)
+    }
+
+    return Format.mmCIF({
+        header: pdb.id || 'PDB',
+        categoryNames: Object.keys(categories),
+        categories
+    });
+}
+
+function buildModels(format: Format.PDB): Task<ReadonlyArray<Model>> {
+    return Task.create('Create PDB Model', async ctx => {
+        await ctx.update('Converting to mmCIF...');
+        const cif = await pdbToMmCIF(format.data);
+        return from_mmCIF(cif).runInContext(ctx);
+    });
+}
+
+export default buildModels;
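
Putting the pieces together, a hedged end-to-end sketch (pdbData is a placeholder; import paths follow the ones used elsewhere in this commit):

    import { parse } from 'mol-io/reader/pdb/parser';

    const parsed = await parse(pdbData, '1CBS').run();
    if (!parsed.isError) {
        // convert to mmCIF categories and build models through the existing mmCIF path
        const models = await buildModels(Format.PDB(parsed.result)).run();
        console.log(models.length);
    }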

+ 9 - 0
src/mol-util/string.ts

@@ -37,4 +37,13 @@ export function snakeCaseToWords(str: string) {
 
 export function stringToWords(str: string) {
     return capitalize(splitCamelCase(splitSnakeCase(str)))
+}
+
+export function substringStartsWith(str: string, start: number, end: number, target: string) {
+    const len = target.length;
+    if (len > end - start) return false;
+    for (let i = 0; i < len; i++) {
+        if (str.charCodeAt(start + i) !== target.charCodeAt(i)) return false;
+    }
+    return true;
 }
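
This is an allocation-free alternative to data.substring(start, end).startsWith(target), which matters in the per-line hot loops above. For example:

    const line = 'HETATM 1415  O2  GOL A 187';
    substringStartsWith(line, 0, line.length, 'HETATM'); // true
    substringStartsWith(line, 0, 3, 'HETATM');           // false: window shorter than target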