فهرست منبع

Merge pull request #231 from molstar/sdf-parser-improvements

Sdf parser improvements
David Sehnal 3 سال پیش
والد
کامیت
2df145aa8f

+ 1 - 0
CHANGELOG.md

@@ -7,6 +7,7 @@ Note that since we don't clearly distinguish between a public and private interf
 ## [Unreleased]
 
 - Add `tubularHelices` parameter to Cartoon representation
+- Add `SdfFormat` and update SDF parser to be able to parse data headers according to spec (hopefully :)) #230
 
 ## [v2.1.0] - 2021-07-05
 

+ 10 - 7
src/mol-io/reader/_spec/sdf.spec.ts

@@ -22,8 +22,8 @@ M  END
 > <DATABASE_NAME>
 drugbank
 
-> <SMILES>
-[O-]P([O-])([O-])=O
+> 5225 <TEST_FIELD>
+whatever
 
 > <INCHI_IDENTIFIER>
 InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4)/p-3
@@ -362,22 +362,25 @@ describe('sdf reader', () => {
         expect(bonds.atomIdxB.value(3)).toBe(5);
         expect(bonds.order.value(3)).toBe(1);
 
-        expect(dataItems.dataHeader.value(0)).toBe('DATABASE_ID');
+        expect(dataItems.dataHeader.value(0)).toBe('<DATABASE_ID>');
         expect(dataItems.data.value(0)).toBe('0');
 
-        expect(dataItems.dataHeader.value(1)).toBe('DATABASE_NAME');
+        expect(dataItems.dataHeader.value(1)).toBe('<DATABASE_NAME>');
         expect(dataItems.data.value(1)).toBe('drugbank');
 
-        expect(dataItems.dataHeader.value(31)).toBe('SYNONYMS');
+        expect(dataItems.dataHeader.value(2)).toBe('5225 <TEST_FIELD>');
+        expect(dataItems.data.value(2)).toBe('whatever');
+
+        expect(dataItems.dataHeader.value(31)).toBe('<SYNONYMS>');
         expect(dataItems.data.value(31)).toBe('Orthophosphate; Phosphate');
 
         expect(compound1.dataItems.data.value(0)).toBe('0');
         expect(compound2.dataItems.data.value(0)).toBe('1');
 
-        expect(compound3.dataItems.dataHeader.value(2)).toBe('PUBCHEM_CONFORMER_DIVERSEORDER');
+        expect(compound3.dataItems.dataHeader.value(2)).toBe('<PUBCHEM_CONFORMER_DIVERSEORDER>');
         expect(compound3.dataItems.data.value(2)).toBe('1\n11\n10\n3\n15\n17\n13\n5\n16\n7\n14\n9\n8\n4\n18\n6\n12\n2');
 
-        expect(compound3.dataItems.dataHeader.value(21)).toBe('PUBCHEM_COORDINATE_TYPE');
+        expect(compound3.dataItems.dataHeader.value(21)).toBe('<PUBCHEM_COORDINATE_TYPE>');
         expect(compound3.dataItems.data.value(21)).toBe('2\n5\n10');
     });
 });

+ 14 - 10
src/mol-io/reader/sdf/parser.ts

@@ -13,16 +13,20 @@ import { Tokenizer, TokenBuilder } from '../common/text/tokenizer';
 import { TokenColumnProvider as TokenColumn } from '../common/text/column/token';
 
 /** http://c4.cabrillo.edu/404/ctfile.pdf - page 41 */
+
+export interface SdfFileCompound { // One entry of SdfFile.compounds: a MOL part plus its associated data items.
+    readonly molFile: MolFile, // MolFile is declared outside this excerpt; presumably the parsed MOL block of this compound.
+    readonly dataItems: { // Header/value pairs; in the spec expectations index i of dataHeader pairs with index i of data.
+        readonly dataHeader: Column<string>, // Header line text after the leading "> ", angle brackets kept, e.g. '<DATABASE_ID>' or '5225 <TEST_FIELD>'.
+        readonly data: Column<string> // Value text that follows the header; may contain several lines separated by '\n'.
+    }
+}
+
 export interface SdfFile {
-    readonly compounds: {
-        readonly molFile: MolFile,
-        readonly dataItems: {
-            readonly dataHeader: Column<string>,
-            readonly data: Column<string>
-        }
-    }[]
+    readonly compounds: SdfFileCompound[] // Same shape as the former inline type, now named so other modules can import SdfFileCompound.
 }
 
+
 const delimiter = '$$$$';
 function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, data: Column<string> } {
     const dataHeader = TokenBuilder.create(tokenizer.data, 32);
@@ -33,8 +37,8 @@ function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, da
         if (line.startsWith(delimiter)) break;
         if (!line) continue;
 
-        if (line.startsWith('> <')) {
-            TokenBuilder.add(dataHeader, tokenizer.tokenStart + 3, tokenizer.tokenEnd - 1);
+        if (line.startsWith('> ')) {
+            TokenBuilder.add(dataHeader, tokenizer.tokenStart + 2, tokenizer.tokenEnd);
 
             Tokenizer.markLine(tokenizer);
             const start = tokenizer.tokenStart;
@@ -42,7 +46,7 @@ function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, da
             let added = false;
             while (tokenizer.position < tokenizer.length) {
                 const line2 = Tokenizer.readLine(tokenizer);
-                if (!line2 || line2.startsWith(delimiter) || line2.startsWith('> <')) {
+                if (!line2 || line2.startsWith(delimiter) || line2.startsWith('> ')) {
                     TokenBuilder.add(data, start, end);
                     added = true;
                     break;

+ 3 - 3
src/mol-model-formats/structure/mol.ts

@@ -17,7 +17,7 @@ import { ModelFormat } from '../format';
 import { IndexPairBonds } from './property/bonds/index-pair';
 import { Trajectory } from '../../mol-model/structure';
 
-async function getModels(mol: MolFile, ctx: RuntimeContext) {
+export async function getMolModels(mol: MolFile, format: ModelFormat<any> | undefined, ctx: RuntimeContext) {
     const { atoms, bonds } = mol;
 
     const MOL = Column.ofConst('MOL', mol.atoms.count, Column.Schema.str);
@@ -61,7 +61,7 @@ async function getModels(mol: MolFile, ctx: RuntimeContext) {
         atom_site
     });
 
-    const models = await createModels(basics, MolFormat.create(mol), ctx);
+    const models = await createModels(basics, format ?? MolFormat.create(mol), ctx);
 
     if (models.frameCount > 0) {
         const indexA = Column.ofIntArray(Column.mapToArray(bonds.atomIdxA, x => x - 1, Int32Array));
@@ -91,5 +91,5 @@ namespace MolFormat {
 }
 
 export function trajectoryFromMol(mol: MolFile): Task<Trajectory> {
-    return Task.create('Parse MOL', ctx => getModels(mol, ctx));
+    return Task.create('Parse MOL', ctx => getMolModels(mol, void 0, ctx)); // No explicit format is passed, so getMolModels falls back to MolFormat.create(mol).
 }

+ 29 - 0
src/mol-model-formats/structure/sdf.ts

@@ -0,0 +1,29 @@
+/**
+ * Copyright (c) 2021 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author David Sehnal <david.sehnal@gmail.com>
+ */
+
+import { SdfFileCompound } from '../../mol-io/reader/sdf/parser';
+import { Trajectory } from '../../mol-model/structure';
+import { Task } from '../../mol-task';
+import { ModelFormat } from '../format';
+import { getMolModels } from './mol';
+
+export { SdfFormat };
+
+type SdfFormat = ModelFormat<SdfFileCompound> // ModelFormat specialised to carry a single SDF compound as its data.
+
+namespace SdfFormat { // Shares its name with the type above, so SdfFormat is usable both as a type and as a helper namespace.
+    export function is(x?: ModelFormat): x is SdfFormat { // Type guard: true only when x is present and x.kind is exactly 'sdf'.
+        return x?.kind === 'sdf';
+    }
+
+    export function create(mol: SdfFileCompound): SdfFormat { // Wraps a compound as an SdfFormat; the name is taken from the MOL title and data is the compound itself.
+        return { kind: 'sdf', name: mol.molFile.title, data: mol };
+    }
+}
+
+export function trajectoryFromSdf(mol: SdfFileCompound): Task<Trajectory> { // Creates a 'Parse SDF' task for one SDF compound.
+    return Task.create('Parse SDF', ctx => getMolModels(mol.molFile, SdfFormat.create(mol), ctx)); // Reuses the MOL model builder on the compound's molFile, passing an SdfFormat instead of the default MolFormat.
+}

+ 3 - 2
src/mol-plugin-state/transforms/model.ts

@@ -40,6 +40,7 @@ import { coordinatesFromXtc } from '../../mol-model-formats/structure/xtc';
 import { parseXyz } from '../../mol-io/reader/xyz/parser';
 import { trajectoryFromXyz } from '../../mol-model-formats/structure/xyz';
 import { parseSdf } from '../../mol-io/reader/sdf/parser';
+import { trajectoryFromSdf } from '../../mol-model-formats/structure/sdf';
 
 export { CoordinatesFromDcd };
 export { CoordinatesFromXtc };
@@ -308,8 +309,8 @@ const TrajectoryFromSDF = PluginStateTransform.BuiltIn({
 
             const models: Model[] = [];
 
-            for (const { molFile } of parsed.result.compounds) {
-                const traj = await trajectoryFromMol(molFile).runInContext(ctx);
+            for (const compound of parsed.result.compounds) {
+                const traj = await trajectoryFromSdf(compound).runInContext(ctx);
                 for (let i = 0; i < traj.frameCount; i++) {
                     models.push(await Task.resolveInContext(traj.getFrameAtIndex(i), ctx));
                 }